nlp.bib

%% Bibliography for NLP (natural language processing), as applied to
%% software engineering tasks.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Criticisms of Tellina's accuracy
%%%

@InProceedings{NeurIPS-2020-NLC2CMD-Competition,
  author    = {Agarwal, Mayank and Chakraborti, Tathagata and Fu, Quchen and Gros, David and Lin, Xi Victoria and Maene, Jaron and Talamadupula, Kartik and Teng, Zhongwei and White, Jules},
  title     = {{NeurIPS} 2020 {NLC2CMD} Competition: Translating Natural Language to {Bash} Commands},
  booktitle = {Proceedings of the NeurIPS 2020 Competition and Demonstration Track},
  series    = {Proceedings of Machine Learning Research},
  volume    = {133},
  pages     = {302--324},
  month     = {06--12 Dec},
  year      = {2021},
  pdf       = {http://proceedings.mlr.press/v133/agarwal21b/agarwal21b.pdf},
  url       = {https://proceedings.mlr.press/v133/agarwal21b.html},
  abstract  = {The NLC2CMD Competition hosted at NeurIPS 2020 aimed to bring the power of natural language processing to the command line. Participants were tasked with building models that can transform descriptions of command line tasks in English to their Bash syntax. This is a report on the competition with details of the task, metrics, data, attempted solutions, and lessons learned.},
}

@InProceedings{FuTWS2021,
  author    = {Fu, Quchen and Teng, Zhongwei and White, Jules and Schmidt, Douglas C.},
  title     = {A Transformer-based Approach for Translating Natural Language to {Bash} Commands},
  booktitle = {2021 20th IEEE International Conference on Machine Learning and Applications (ICMLA)},
  year      = {2021},
  pages     = {1245--1248},
}


@InProceedings{ChenHLO2020,
  author    = {Chen, Yan and Herskovitz, Jaylin and Lasecki, Walter S. and Oney, Steve},
  title     = {Bashon: A Hybrid Crowd-Machine Workflow for Shell Command Synthesis},
  booktitle = {2020 IEEE Symposium on Visual Languages and Human-Centric Computing (VL/HCC)},
  year      = {2020},
  pages     = {1--8},
  doi       = {10.1109/VL/HCC50065.2020.9127248},
}

@InProceedings{ZhangLXTZLZ2022,
  author    = {Neng Zhang and Chao Liu and Xin Xia and Christoph Treude and Ying Zou and David Lo and Zibin Zheng},
  title     = {{ShellFusion}: Answer Generation for Shell Programming Tasks via Knowledge Fusion},
  crossref  = {ICSE2022},
  NEEDpages = {*},
}

@InProceedings{KanCW2020,
  author    = {Kan, Jia-Wei and Chien, Wei-Chin and Wang, Sheng-De},
  title     = {Grid Structure Attention for Natural Language Interface to {Bash} Commands},
  booktitle = {2020 International Computer Symposium (ICS)},
  year      = {2020},
  pages     = {67--72},
  doi       = {10.1109/ICS51289.2020.00023},
}

@InProceedings{KumarNSAS2019,
  author    = {Kumar, N. S. and Nagalakshmi, Malathy and Sharma, Tanya and Ambati, Sai Bhavana and Satyanarayana, Vibha},
  title     = {Natural Language Interface to {Linux} Shell -- Report},
  booktitle = {2019 3rd International Conference on Computing and Communications Technologies (ICCCT)},
  year      = {2019},
  pages     = {24--30},
  doi       = {10.1109/ICCCT2.2019.8824800},
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Participants in NLC2CMD competition
%%%


@inproceedings{VaswaniSPUJGKP2017,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is All you Need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  pages     = {5998--6008},
  year      = {2017},
  url       = {https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf},
}

@InProceedings{TangMRS2018,
  author    = {Gongbo Tang and Mathias M{\"{u}}ller and Annette Rios and Rico Sennrich},
  title     = {Why Self-Attention? {A} Targeted Evaluation of Neural Machine Translation Architectures},
  booktitle = {2018 Conference on Empirical Methods in Natural Language Processing},
  address   = {Brussels, Belgium},
  year      = {2018},
}

@Misc{Gros2019,
  author = {David Gros},
  title  = {{AInix}: An Open Platform for Natural Language Interfaces to Shell Commands},
  month  = may,
  year   = {2019},
  note   = {Undergraduate Honors Thesis, Computer Science Department, University of Texas at Austin},
  url    = {http://www.cs.utexas.edu/users/ai-lab/pub-view.php?PubID=127814},
}

@TechReport{RadfordWCLAS2019,
  author      = {Alec Radford and Jeffrey Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever},
  title       = {Language models are unsupervised multitask learners},
  institution = {OpenAI},
  year        = {2019},
  url         = {http://www.persagen.com/files/misc/radford2019language.pdf},
}

%% NOTE: the citation key spells the first author's surname "Linvinov", while
%% the author field has "Litvinov".  The key is left unchanged so that existing
%% citations continue to resolve.
@TechReport{LinvinovMPKO2020,
  author      = {Denis Litvinov and Gleb Morgachev and Artem Popov and Nikolai Korolev and Dmitrii Orekhov},
  title       = {{NLC2CMD} Report from {JB} Team},
  institution = {JetBrains},
  month       = dec,
  year        = {2020},
  url         = {https://github.com/JetBrains/nlc2cmd/blob/master/report.pdf},
}

@Misc{KangY2020,
  author = {Sungmin Kang and Juyeon Yoon},
  title  = {Hierarchical Decoding of {Bash} Commands},
  note   = {Talk at NeurIPS 2020},
  year   = {2020},
  url    = {https://slideslive.com/38942503/hierarchical-decoder-for-bash-commands},
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% NLP to command line (bash) tools
%%%

@inproceedings{NEURIPS2020_1457c0d6,
  author    = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D. and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title     = {Language Models are Few-Shot Learners},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS 2020)},
  volume    = {33},
  pages     = {1877--1901},
  year      = {2020},
  url       = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
}

@Article{LiWYT2019,
  author  = {Hao Li and Yu-Ping Wang and Jie Yin and Gang Tan},
  title   = {{SmartShell}: Automated Shell Scripts Synthesis from Natural Language},
  journal = {International Journal of Software Engineering and Knowledge Engineering},
  year    = {2019},
  volume  = {29},
  number  = {2},
  pages   = {197--220},
  doi     = {10.1142/S0218194019500098},
}

@InProceedings{CLAI-NeurIPS2019-demonstration,
  author    = {Mayank Agarwal and Jorge Barroso Carmona and Tathagata Chakraborti and Eli M. Dow and Kshitij P. Fadnis and Borja Godoy and Kartik Talamadupula},
  title     = {Project {CLAI} --- Bringing {AI} to the Command Line Interface},
  booktitle = {NeurIPS 2019 Demonstration Track},
  year      = {2019},
}

%% NOTE(review): this looks like an earlier version of the arXiv paper cited
%% as CLAI-arxiv-2002.00762 -- confirm before citing both.
@article{Agarwal2020ProjectCI,
  author  = {Mayank Agarwal and Jorge J. Barroso and Tathagata Chakraborti and Eli M. Dow and Kshitij P. Fadnis and Borja Godoy and Madhavan Pallan and Kartik Talamadupula},
  title   = {Project {CLAI}: Instrumenting the Command Line as a New Environment for {AI} Agents},
  journal = {arXiv: Human-Computer Interaction},
  year    = {2020},
}

@Misc{CLAI-arxiv-2002.00762,
  author       = {Mayank Agarwal and Jorge J. Barroso and Tathagata Chakraborti and Eli M. Dow and Kshitij P. Fadnis and Borja Godoy and Kartik Talamadupula},
  title        = {{CLAI:} {A} Platform for {AI} Skills on the Command Line},
  howpublished = {https://arxiv.org/abs/2002.00762},
  url          = {https://arxiv.org/abs/2002.00762},
  note         = {v2},
  month        = jun,
  year         = {2020},
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% NLP to test assertion tools
%%%


@InProceedings{DinellaRML2022,
  author   = {Dinella, Elizabeth and Ryan, Gabriel and Mytkowicz, Todd and Lahiri, Shuvendu K.},
  title    = {{TOGA}: a neural method for test oracle generation},
  crossref = {ICSE2022},
  pages    = {2130--2141},
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Models of code
%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% LLMs
%%%

@inproceedings{10.1145/3491101.3519665,
  author    = {Vaithilingam, Priyan and Zhang, Tianyi and Glassman, Elena L.},
  title     = {Expectation vs.\ Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models},
  booktitle = {Extended Abstracts of the 2022 CHI Conference on Human Factors in Computing Systems},
  series    = {CHI EA '22},
  location  = {New Orleans, LA, USA},
  year      = {2022},
  articleno = {332},
  numpages  = {7},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  isbn      = {9781450391566},
  doi       = {10.1145/3491101.3519665},
  url       = {https://doi.org/10.1145/3491101.3519665},
  keywords  = {github copilot, large language model},
  abstract  = {Recent advances in Large Language Models (LLM) have made automatic code generation possible for real-world programming tasks in general-purpose programming languages such as Python. However, there are few human studies on the usability of these tools and how they fit the programming workflow. In this work, we conducted a within-subjects user study with 24 participants to understand how programmers use and perceive Copilot, a LLM-based code generation tool. We found that, while Copilot did not necessarily improve the task completion time or success rate, most participants preferred to use Copilot in daily programming tasks, since Copilot often provided a useful starting point and saved the effort of searching online. However, participants did face difficulties in understanding, editing, and debugging code snippets generated by Copilot, which significantly hindered their task-solving effectiveness. Finally, we highlighted several promising directions for improving the design of Copilot based on our observations and participants’ feedback.},
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Other
%%%

@InProceedings{TanYKZ2007,
  author   = {Tan, Lin and Yuan, Ding and Krishna, Gopal and Zhou, Yuanyuan},
  title    = {/*{iComment}: Bugs or Bad Comments?*/},
  crossref = {SOSP2007},
  pages    = {145--158},
}


@InProceedings{TanZP2011,
  author   = {Tan, Lin and Zhou, Yuanyuan and Padioleau, Yoann},
  title    = {{aComment}: Mining annotations from comments and code to detect interrupt related concurrency bugs},
  crossref = {ICSE2011},
  pages    = {11--20},
}


@InProceedings{YeSMBL2016,
  author    = {Xin Ye and Hui Shen and Xiao Ma and Razvan Bunescu and Chang Liu},
  title     = {From word embeddings to document similarities for improved information retrieval in software engineering},
  crossref  = {ICSE2016},
  NEEDpages = {*},
}


@InProceedings{AllamanisBBS2014,
  author   = {Allamanis, Miltiadis and Barr, Earl T. and Bird, Christian and Sutton, Charles},
  title    = {Learning natural coding conventions},
  crossref = {FSE2014},
  pages    = {281--293},
}


@InProceedings{PanditaXZXOP2012,
  author   = {Pandita, Rahul and Xiao, Xusheng and Zhong, Hao and Xie, Tao and Oney, Stephen and Paradkar, Amit},
  title    = {Inferring method specifications from natural language {API} descriptions},
  crossref = {ICSE2012},
  pages    = {815--825},
}


@InProceedings{HindleBSGD2012,
  author   = {Hindle, Abram and Barr, Earl T. and Su, Zhendong and Gabel, Mark and Devanbu, Premkumar},
  title    = {On the Naturalness of Software},
  crossref = {ICSE2012},
  pages    = {837--847},
}


@InProceedings{HowardGPVS2013,
  author   = {Howard, Matthew J. and Gupta, Samir and Pollock, Lori and Vijay-Shanker, K.},
  title    = {Automatically mining software-based, semantically-similar words from comment-code mappings},
  crossref = {MSR2013},
  pages    = {377--386},
}


@InProceedings{GuptaMPVS2013,
  author   = {Samir Gupta and Sana Malik and Lori Pollock and K. Vijay-Shanker},
  title    = {Part-of-speech tagging of program identifiers for improved text-based software engineering tools},
  crossref = {ICPC2013},
  pages    = {3--12},
}


@InProceedings{SridharaHMPVS2010,
  author   = {Sridhara, Giriprasad and Hill, Emily and Muppaneni, Divya and Pollock, Lori and Vijay-Shanker, K.},
  title    = {Towards automatically generating summary comments for {Java} methods},
  crossref = {ASE2010},
  pages    = {43--52},
}


@InProceedings{HillFBSNPV2008,
  author   = {Hill, Emily and Fry, Zachary P. and Boyd, Haley and Sridhara, Giriprasad and Novikova, Yana and Pollock, Lori and Vijay-Shanker, K.},
  title    = {{AMAP}: Automatically mining abbreviation expansions in programs to enhance software maintenance tools},
  crossref = {MSR2008},
  pages    = {79--88},
}


@InProceedings{ArnaoudovaEOGA2010,
  author   = {Arnaoudova, Venera and Eshkevari, Laleh and Oliveto, Rocco and Gu{\'e}h{\'e}neuc, Yann-Ga{\"e}l and Antoniol, Giuliano},
  title    = {Physical and conceptual identifier dispersion: Measures and relation to fault proneness},
  crossref = {ICSM2010},
  pages    = {1--5},
}


@Article{LawrieMFB2007,
  author   = {Lawrie, Dawn and Morrell, Christopher and Feild, Henry and Binkley, David},
  title    = {Effective identifier names for comprehension and memory},
  journal  = {Innovations in Systems and Software Engineering},
  volume   = {3},
  number   = {4},
  pages    = {303--318},
  month    = dec,
  year     = {2007},
  abstract =
   {Readers of programs have two main sources of domain information:
    identifier names and comments. When functions are uncommented, as many are,
    comprehension is almost exclusively dependent on the identifier
    names. Assuming that writers of programs want to create quality identifiers
    (e.g., identifiers that include relevant domain knowledge), one must ask
    how should they go about it. For example, do the initials of a concept name
    provide enough information to represent the concept? If not, and a longer
    identifier is needed, is an abbreviation satisfactory or does the concept
    need to be captured in an identifier that includes full words? What is the
    effect of longer identifiers on limited short term memory capacity? Results
    from a study designed to investigate these questions are reported. The
    study involved over 100 programmers who were asked to describe 12 different
    functions and then recall identifiers that appeared in each function. The
    functions used three different levels of identifiers: single letters,
    abbreviations, and full words. Responses allow the extent of comprehension
    associated with the different levels to be studied along with their impact
    on memory. The functions used in the study include standard computer
    science textbook algorithms and functions extracted from production
    code. The results show that full-word identifiers lead to the best
    comprehension; however, in many cases, there is no statistical difference
    between using full words and abbreviations. When considered in the light of
    limited human short-term memory, well-chosen abbreviations may be
    preferable in some situations since identifiers with fewer syllables are
    easier to remember.},
}


@Article{DeissenboeckP2006,
  author  = {Deissenboeck, Florian and Pizka, Markus},
  title   = {Concise and consistent naming},
  journal = {Software Quality Journal},
  volume  = {14},
  number  = {3},
  pages   = {261--282},
  month   = sep,
  year    = {2006},
}


@InProceedings{MihalceaCS2006,
  author   = {Mihalcea, Rada and Corley, Courtney and Strapparava, Carlo},
  title    = {Corpus-based and knowledge-based measures of text semantic similarity},
  crossref = {AAAI2006},
  pages    = {775--780},
}


@InProceedings{LawrieMB2010,
  author   = {Dawn Lawrie and Christopher Morrell and Dave Binkley},
  title    = {Normalizing source code vocabulary},
  crossref = {WCRE2010},
  pages    = {3--12},
}


@InProceedings{MotwaniBrun2019,
  author   = {Motwani, Manish and Brun, Yuriy},
  title    = {Automatically Generating Precise Oracles from Structured Natural Language Specifications},
  crossref = {ICSE2019},
  pages    = {188--199},
}


@InProceedings{HuLXLJ2018,
  author   = {Hu, Xing and Li, Ge and Xia, Xin and Lo, David and Jin, Zhi},
  title    = {Deep code comment generation},
  crossref = {ICPC2018},
  pages    = {200--210},
}


@Misc{LouisDBS2018,
  author       = {Annie Louis and Santanu Kumar Dash and Earl T. Barr and Charles Sutton},
  title        = {Deep Learning to Detect Redundant Method Comments},
  howpublished = {\url{http://arxiv.org/abs/1806.04616}},
  month        = jun,
  year         = {2018},
}


@InProceedings{MikolovSCCD2013,
  author   = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  title    = {Distributed representations of words and phrases and their compositionality},
  crossref = {NIPS2013},
  pages    = {3111--3119},
}

@InProceedings{MovshovitzAttiasC2013,
  author   = {Movshovitz-Attias, Dana and Cohen, William W.},
  title    = {Natural language models for predicting programming comments},
  crossref = {ACL2013short},
  pages    = {35--40},
}

%% NOTE: the citation key says "Buze" while the author field has "Buse".  The
%% key is left unchanged so that existing citations continue to resolve.
@InProceedings{BuzeW2010,
  author   = {Buse, Raymond P. L. and Weimer, Westley R.},
  title    = {Automatically documenting program changes},
  crossref = {ASE2010},
  pages    = {33--42},
}

@InProceedings{PascarellaB2017,
  author       = {Pascarella, Luca and Bacchelli, Alberto},
  title        = {Classifying code comments in {Java} open-source software systems},
  crossref     = {MSR2017},
  pages        = {227--237},
  supersededby = {PascarellaBB2019},
}

@Article{PascarellaBB2019,
  author  = {Pascarella, Luca and Bruntink, Magiel and Bacchelli, Alberto},
  title   = {Classifying code comments in {Java} software systems},
  journal = JEmpiricalSE,
  year    = {2019},
  volume  = {24},
  number  = {3},
  pages   = {1499--1537},
  month   = jun,
}


@InProceedings{DevlinCLT2019,
  author   = {Jacob Devlin and Ming-Wei Chang and Kenton Lee and Kristina Toutanova},
  title    = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  crossref = {NAACL-HLT2019},
  pages    = {4171--4186},
}


%  LocalWords:  InProceedings TanYKZ2007 Gopal Zhou Yuanyuan iComment Iyer pdf
%  LocalWords:  booktitle SOSP2007 SOSP2007date SOSP2007addr Benwen NN pre url
%  LocalWords:  testEntrySetClearChangesMap Srinivasan Ioannis Konstas Xin
%  LocalWords:  testSettingHeightThatIsTooSmallLeavesHeightUnchanged LSTM
%  LocalWords:  Zettlemoyer YeSMBL2016 Shen Xiao Razvan Bunescu Liu MRR Za
%  LocalWords:  ICSE2016 NEEDpages ICSE2016date ICSE2016addr Wiki LSA CCG
%  LocalWords:  stemmer Kushman Barzilay Turkers regex regexes Mise Kiddon
%  LocalWords:  Ganesa Thandavam Ponnuraj Yejin Choi Branavan Miltiadis xj
%  LocalWords:  Allamanis AAAI Briand Briand's Hirschberg uncompelling xk
%  LocalWords:  Movshovitz Attias ICPC preprocess pickaxe xl Convolutional
%  LocalWords:  Hao Peng ie camelcase tokenizer Naturalize's NLC2CMD
% LocalWords:  Agarwal Mayank Chakraborti Tathagata Fu Quchen Gros
% LocalWords:  Maene Jaron Talamadupula Kartik Teng Zhongwei