<?xml version="1.0" encoding="UTF-8"?>
<article article-type="research-article" dtd-version="1.3" xml:lang="ru" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://metafora.rcsi.science/xsd_files/journal3.xsd">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">moitvivt</journal-id>
      <journal-title-group>
        <journal-title xml:lang="ru">Моделирование, оптимизация и информационные технологии</journal-title>
        <trans-title-group xml:lang="en">
          <trans-title>Modeling, Optimization and Information Technology</trans-title>
        </trans-title-group>
      </journal-title-group>
      <issn pub-type="epub">2310-6018</issn>
      <publisher>
        <publisher-name>Издательство</publisher-name>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="doi">10.26102/2310-6018/2022.38.3.016</article-id>
      <article-id pub-id-type="custom" custom-type="elpub">1227</article-id>
      <title-group>
        <article-title xml:lang="ru">Идентификация автора исходного кода программы на основе неоднородных данных для решения задач кибербезопасности</article-title>
        <trans-title-group xml:lang="en">
          <trans-title>Authorship identification of a heterogeneous source code for the purposes of cybersecurity management</trans-title>
        </trans-title-group>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author" corresp="yes">
          <contrib-id contrib-id-type="orcid">0000-0002-2587-2222</contrib-id>
          <name-alternatives>
            <name name-style="eastern" xml:lang="ru">
              <surname>Романов</surname>
              <given-names>Александр Сергеевич</given-names>
            </name>
            <name name-style="western" xml:lang="en">
              <surname>Romanov</surname>
              <given-names>Aleksandr Sergeevich</given-names>
            </name>
          </name-alternatives>
          <email>alexx.romanov@gmail.com</email>
          <xref ref-type="aff">aff-1</xref>
        </contrib>
        <contrib contrib-type="author" corresp="yes">
          <contrib-id contrib-id-type="orcid">0000-0001-5619-1836</contrib-id>
          <name-alternatives>
            <name name-style="eastern" xml:lang="ru">
              <surname>Куртукова</surname>
              <given-names>Анна Владимировна</given-names>
            </name>
            <name name-style="western" xml:lang="en">
              <surname>Kurtukova</surname>
              <given-names>Anna Vladimirovna</given-names>
            </name>
          </name-alternatives>
          <email>av.kurtukova@gmail.com</email>
          <xref ref-type="aff">aff-2</xref>
        </contrib>
        <contrib contrib-type="author" corresp="yes">
          <contrib-id contrib-id-type="orcid">0000-0003-2393-6701</contrib-id>
          <name-alternatives>
            <name name-style="eastern" xml:lang="ru">
              <surname>Шелупанов</surname>
              <given-names>Александр Александрович</given-names>
            </name>
            <name name-style="western" xml:lang="en">
              <surname>Shelupanov</surname>
              <given-names>Aleksandr Aleksandrovich</given-names>
            </name>
          </name-alternatives>
          <email>saa@fb.tusur.ru</email>
          <xref ref-type="aff">aff-3</xref>
        </contrib>
        <contrib contrib-type="author" corresp="yes">
          <contrib-id contrib-id-type="orcid">0000-0001-7844-4363</contrib-id>
          <name-alternatives>
            <name name-style="eastern" xml:lang="ru">
              <surname>Федотова</surname>
              <given-names>Анастасия Михайловна</given-names>
            </name>
            <name name-style="western" xml:lang="en">
              <surname>Fedotova</surname>
              <given-names>Anastasia Mikhailovna</given-names>
            </name>
          </name-alternatives>
          <email>fedotova.a.747@e.tusur.ru</email>
          <xref ref-type="aff">aff-4</xref>
        </contrib>
      </contrib-group>
      <aff-alternatives id="aff-1">
        <aff xml:lang="ru">Томский государственный университет систем управления и радиоэлектроники</aff>
        <aff xml:lang="en">Tomsk State University of Control Systems and Radioelectronics</aff>
      </aff-alternatives>
      <aff-alternatives id="aff-2">
        <aff xml:lang="ru">Томский государственный университет систем управления и радиоэлектроники</aff>
        <aff xml:lang="en">Tomsk State University of Control Systems and Radioelectronics</aff>
      </aff-alternatives>
      <aff-alternatives id="aff-3">
        <aff xml:lang="ru">Томский государственный университет систем управления и радиоэлектроники</aff>
        <aff xml:lang="en">Tomsk State University of Control Systems and Radioelectronics</aff>
      </aff-alternatives>
      <aff-alternatives id="aff-4">
        <aff xml:lang="ru">Томский государственный университет систем управления и радиоэлектроники</aff>
        <aff xml:lang="en">Tomsk State University of Control Systems and Radioelectronics</aff>
      </aff-alternatives>
      <pub-date pub-type="epub">
        <day>01</day>
        <month>01</month>
        <year>2026</year>
      </pub-date>
      <volume>1</volume>
      <issue>1</issue>
      <elocation-id>10.26102/2310-6018/2022.38.3.016</elocation-id>
      <permissions>
        <copyright-statement>Copyright © Авторы, 2026</copyright-statement>
        <copyright-year>2026</copyright-year>
        <license license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/">
          <license-p>This work is licensed under a Creative Commons Attribution 4.0 International License</license-p>
        </license>
      </permissions>
      <self-uri xlink:href="https://moitvivt.ru/ru/journal/article?id=1227"/>
      <abstract xml:lang="ru">
        <p>Статья посвящена идентификации автора неоднородного исходного кода программы на основе гибридной нейронной сети. Решения данной проблемы особенно актуальны для областей информационной безопасности, образовательного процесса и защиты авторского права. В статье представлен анализ современных методов решения поставленной задачи. Авторами предлагается собственная методика на основе гибридной нейронной сети, зарекомендовавшей себя в ранних исследованиях, направленных на оценку эффективности данного подхода в простых и сложных случаях.&#13;
Данная работа включает в себя эксперименты по ранее не рассмотренным случаям идентификации автора исходного кода на основе неоднородных данных. Рассматриваются случаи, актуальные для корпоративной разработки. Среди них анализ исходных кодов, представленных в виде коммитов, и обучение модели на наборах данных, включающих в себя два и более языка программирования. Также исследовано набирающее популярность направление определения авторства искусственно сгенерированного исходного кода. Для каждого случая сформирован набор данных и проведен эксперимент. &#13;
Оценка эффективности авторской методики для всех трех сложных случаев осуществлена при помощи перекрестной проверки по 10 блокам. Средняя точность для смешанных наборов данных составила 87 % для двух языков программирования и 76 % для трех и более языков, соответственно. Точность методики для решения задачи определения авторства искусственно сгенерированных исходных кодов в среднем составила 81,5 %. Идентификация автора исходного кода программы на основе коммитов осуществлялась с точностью 84 %. Эксперименты показали, что во всех трех случаях эффективность методики может быть повышена путем использования больших объемов обучающих данных.</p>
      </abstract>
      <trans-abstract xml:lang="en">
        <p>The article is devoted to identifying the author of heterogeneous program source code by means of a hybrid neural network. Solutions to this problem are especially relevant to the fields of information security, the educational process, and copyright protection. The article analyzes modern methods of addressing this problem. The authors propose their own methodology based on a hybrid neural network that has proven itself in earlier studies aimed at evaluating the effectiveness of this approach in simple and difficult cases. &#13;
This research incorporates experiments on previously unconsidered cases of source code author identification based on heterogeneous data. Cases relevant to corporate development are examined, including the analysis of source code presented as commits and model training on datasets containing two or more programming languages. Additionally, the increasingly popular task of determining the authorship of artificially generated source code is investigated. For each case, a dataset was compiled and an experiment was performed.&#13;
The effectiveness of the authors' methodology for all three difficult cases was evaluated using 10-fold cross-validation. The average accuracy for mixed datasets was 87 % for two programming languages and 76 % for three or more languages. The average accuracy of the methodology for authorship identification of artificially generated source code was 81.5 %. Identification of the author of program source code based on commits was carried out with an accuracy of 84 %. Experiments have shown that the effectiveness of the methodology can be improved in all three cases by using larger amounts of training data.</p>
      </trans-abstract>
      <kwd-group xml:lang="ru">
        <kwd>авторство</kwd>
        <kwd>исходный код</kwd>
        <kwd>коммиты</kwd>
        <kwd>генерация</kwd>
        <kwd>нейронная сеть</kwd>
      </kwd-group>
      <kwd-group xml:lang="en">
        <kwd>authorship</kwd>
        <kwd>source code</kwd>
        <kwd>commits</kwd>
        <kwd>generation</kwd>
        <kwd>neural network</kwd>
      </kwd-group>
      <funding-group>
        <funding-statement xml:lang="ru">Работа выполнена при финансовой поддержке Министерства науки и высшего образования РФ в рамках базовой части государственного задания ТУСУРа на 2020–2022 гг. (проект № FEWM-2020-0037).</funding-statement>
        <funding-statement xml:lang="en">The research was supported by the Ministry of Science and Higher Education of the Russian Federation within the basic part of the TUSUR state assignment for 2020–2022 (project No. FEWM-2020-0037).</funding-statement>
      </funding-group>
    </article-meta>
  </front>
  <back>
    <ref-list>
      <title>References</title>
      <ref id="cit1">
        <label>1</label>
        <mixed-citation xml:lang="ru">Куртукова А.В., Романов А.С. Идентификация автора исходного кода методами машинного обучения. Труды СПИИРАН. 2019;18(3):741–765.</mixed-citation>
      </ref>
      <ref id="cit2">
        <label>2</label>
        <mixed-citation xml:lang="ru">Kurtukova A., Romanov A., Shelupanov A. Source Code Authorship Identification Using Deep Neural Networks. Symmetry. 2020;12:2044.</mixed-citation>
      </ref>
      <ref id="cit3">
        <label>3</label>
        <mixed-citation xml:lang="ru">Abuhamad M., AbuHmed T., Mohaisen A., Nyang D. Large-Scale and Language-Oblivious Code Authorship Identification. In Proceedings of the 2018 ACM SIGSAC Conference on Computer and Communications Security. 2018;101–114.</mixed-citation>
      </ref>
      <ref id="cit4">
        <label>4</label>
        <mixed-citation xml:lang="ru">Li Z., Chen G., Chen C., Zou Y., Xu S. RoPGen: Towards Robust Code Authorship Attribution via Automatic Coding Style Transformation. 2022 IEEE 44th International Conference on Software Engineering (ICSE). 2022;1906–1918.</mixed-citation>
      </ref>
      <ref id="cit5">
        <label>5</label>
        <mixed-citation xml:lang="ru">Holland C., Khoshavi N., Jaimes L.G. Code authorship identification via deep graph CNNs. In Proceedings of the 2022 ACM Southeast Conference (ACM SE '22). 2022;144–150.</mixed-citation>
      </ref>
      <ref id="cit6">
        <label>6</label>
        <mixed-citation xml:lang="ru">Bogdanova A., Romanov V. Explainable source code authorship attribution algorithm. Journal of Physics: Conference Series. 2021;2134:012011. DOI: 10.1088/1742-6596/2134/1/012011.</mixed-citation>
      </ref>
      <ref id="cit7">
        <label>7</label>
        <mixed-citation xml:lang="ru">Bogdanova A. Source code authorship attribution using file embeddings. Companion Proceedings of the 2021 ACM SIGPLAN International Conference on Systems, Programming, Languages, and Applications: Software for Humanity. 2021;31–33.</mixed-citation>
      </ref>
      <ref id="cit8">
        <label>8</label>
        <mixed-citation xml:lang="ru">Bogomolov E., Kovalenko V., Rebryk Y., Bacchelli A., Bryksin T. Authorship attribution of source code: a language-agnostic approach and applicability in software engineering. In Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering. 2021;932–944.</mixed-citation>
      </ref>
      <ref id="cit9">
        <label>9</label>
        <mixed-citation xml:lang="ru">Romanov A., Kurtukova A., Fedotova A., Meshcheryakov R. Natural Text Anonymization Using Universal Transformer with a Self-attention. Proceedings of the III International Conference on Language Engineering and Applied Linguistics. 2019;22–37.</mixed-citation>
      </ref>
      <ref id="cit10">
        <label>10</label>
        <mixed-citation xml:lang="ru">Caliskan-Islam A. Deanonymizing programmers via code stylometry. Proceedings of the 24th USENIX Security Symposium. 2015;255–270.</mixed-citation>
      </ref>
      <ref id="cit11">
        <label>11</label>
        <mixed-citation xml:lang="ru">GitHub. Доступно по: https://github.com/ (дата обращения: 14.08.2022).</mixed-citation>
      </ref>
      <ref id="cit12">
        <label>12</label>
        <mixed-citation xml:lang="ru">GitLab. Доступно по: https://gitlab.com/ (дата обращения: 14.08.2022).</mixed-citation>
      </ref>
      <ref id="cit13">
        <label>13</label>
        <mixed-citation xml:lang="ru">Rothe S., Narayan S., Severyn A. Leveraging pre-trained checkpoints for sequence generation tasks. Transactions of the Association for Computational Linguistics. 2020;8:264–280.</mixed-citation>
      </ref>
      <ref id="cit14">
        <label>14</label>
        <mixed-citation xml:lang="ru">Du Z. All NLP tasks are generation tasks: A general pretraining framework. arXiv preprint arXiv:2103.10360. 2021.</mixed-citation>
      </ref>
      <ref id="cit15">
        <label>15</label>
        <mixed-citation xml:lang="ru">Floridi L., Chiriatti M. GPT-3: Its nature, scope, limits, and consequences. Minds and Machines. 2020;30(4):681–694.</mixed-citation>
      </ref>
      <ref id="cit16">
        <label>16</label>
        <mixed-citation xml:lang="ru">Lee J. S., Hsiang J. Patent claim generation by fine-tuning OpenAI GPT-2. World Patent Information. 2020;62:101983.</mixed-citation>
      </ref>
      <ref id="cit17">
        <label>17</label>
        <mixed-citation xml:lang="ru">Душейко А. Генерация лидов новостных текстов с помощью нейронной сети ruGPT-3: магистерская диссертация по направлению подготовки: 45.04.03 Фундаментальная и прикладная лингвистика. 2022.</mixed-citation>
      </ref>
      <ref id="cit18">
        <label>18</label>
        <mixed-citation xml:lang="ru">Pisarevskaya D., Shavrina T. WikiOmnia: generative QA corpus on the whole Russian Wikipedia. arXiv preprint arXiv:2204.08009. 2022.</mixed-citation>
      </ref>
      <ref id="cit19">
        <label>19</label>
        <mixed-citation xml:lang="ru">Li Z. RoPGen: towards robust code authorship attribution via automatic coding style transformation. 2022 IEEE/ACM 44th International Conference on Software Engineering (ICSE). IEEE. 2022;1906–1918.</mixed-citation>
      </ref>
      <ref id="cit20">
        <label>20</label>
        <mixed-citation xml:lang="ru">Cruz-Benito J. Automated source code generation and auto-completion using deep learning: Comparing and discussing current language model-related approaches. AI. 2021;2(1):1–16.</mixed-citation>
      </ref>
      <ref id="cit21">
        <label>21</label>
        <mixed-citation xml:lang="ru">Open AI. Доступно по: https://openai.com/blog/openai-codex (дата обращения: 14.08.2022).</mixed-citation>
      </ref>
      <ref id="cit22">
        <label>22</label>
        <mixed-citation xml:lang="ru">GitHub Copilot. Доступно по: https://copilot.github.com (дата обращения: 14.08.2022).</mixed-citation>
      </ref>
      <ref id="cit23">
        <label>23</label>
        <mixed-citation xml:lang="ru">AlphaCode. Доступно по: https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode (дата обращения: 14.08.2022).</mixed-citation>
      </ref>
      <ref id="cit24">
        <label>24</label>
        <mixed-citation xml:lang="ru">Sber AI ruGPT-3. Доступно по: https://developers.sber.ru/portal/tools/rugpt-3 (дата обращения: 14.08.2022).</mixed-citation>
      </ref>
      <ref id="cit25">
        <label>25</label>
        <mixed-citation xml:lang="ru">PolyCoder. Доступно по: https://venturebeat.com/2022/03/04/researchers-open-source-code-generating-ai-they-claim-can-beat-openais-codex/ (дата обращения: 14.08.2022).</mixed-citation>
      </ref>
      <ref id="cit26">
        <label>26</label>
        <mixed-citation xml:lang="ru">Frantzeskou G., Stamatatos E., Gritzalis S. Identifying authorship by byte-level n-grams: The source code author profile (SCAP) method. Int. J. Digit. Evid. 2007;1:1–18.</mixed-citation>
      </ref>
      <ref id="cit27">
        <label>27</label>
        <mixed-citation xml:lang="ru">Wisse W., Veenman C.J. Scripting DNA: Identifying the JavaScript Programmer. Digit. Investig. 2015;15:61–71.</mixed-citation>
      </ref>
    </ref-list>
    <fn-group>
      <fn fn-type="conflict">
        <p>The authors declare that there are no conflicts of interest present.</p>
      </fn>
    </fn-group>
  </back>
</article>