diff --git a/docs/66_arxiv_agent/multiagent_write_review.ipynb b/docs/66_arxiv_agent/multiagent_write_review.ipynb index 9058adc..f739c24 100644 --- a/docs/66_arxiv_agent/multiagent_write_review.ipynb +++ b/docs/66_arxiv_agent/multiagent_write_review.ipynb @@ -17,7 +17,7 @@ "**Note:** For technical reasons, we only read the abstract. \n", "Of course, as in real life, it would be better to read the entire paper, but this exceeds token limits of SOTA open-weight LLMs.\n", "\n", - "This notebook will only work with an upcoming release of smolagents (> 0.1.3), after this PR is merged: https://github.com/huggingface/smolagents/pull/12" + "This notebook will only work with an upcoming release of smolagents (> 0.1.3)." ] }, { @@ -39,6 +39,7 @@ "source": [ "from IPython.display import display, Markdown\n", "from smolagents.agents import ToolCallingAgent, CodeAgent\n", + "from smolagents.prompts import CODE_SYSTEM_PROMPT\n", "from smolagents import tool, LiteLLMModel\n", "import os\n", "\n", @@ -119,6 +120,11 @@ " api_base=api_base, \n", " api_key=api_key)\n", "\n", + " if system_message is None:\n", + " system_message = CODE_SYSTEM_PROMPT\n", + " else:\n", + " system_message = CODE_SYSTEM_PROMPT + \"\\n\" + system_message\n", + " \n", " agent = CodeAgent(tools=tools, model=model, system_prompt=system_message)\n", " agent.name = name\n", " agent.description = description\n", @@ -300,12 +306,14 @@ " feedback = prompt(f\"\"\"\n", "You are a great reviewer and you like to provide constructive feedback. \n", "If you are provided with a manuscript, you formulate feedback specifically for this manuscript. \n", - "Your goal is to guide the author towards writing\n", + "Your goal is to guide the author towards writing a great manuscript. \n", + "Hence, provide feedback like these examples but focus on what makes sense for the given manuscript:\n", "* a scientific text with a short and descriptive title,\n", "* a scientific text with markdown sub-sections (# title, ## headlines, ...) avoiding bullet points,\n", "* structured in sub-sections by content, e.g. introduction, recent developments, methods, results, discussion, future work, ...\n", "* text using high-quality scientific language,\n", - "* proper citations using markdown links to original paper urls (do not make up references!),\n", + "* proper citations mentioning the first author et al. 
using markdown links to original paper urls (do not make up references!),\n", + "* avoid mentioning \"the paper\" and use proper markdown-link-citations instead,\n", "* a clear abstract at the beginning of the text, and conclusions at the end\n", "\n", "## Manuscript\n", @@ -473,81 +481,61 @@ "output_type": "stream", "text": [ "--------------------------------------------------------------------------------\n", - "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2211.11501 and provide a summary of it....[76 chars]\n", + "| I am asking research-assistant to take care of: Summarize the paper https://arxiv.org/abs/2211.11501...[52 chars]\n", "read_arxiv_paper(https://arxiv.org/abs/2211.11501)\n", - "| Response was: The paper introduces the DS-1000 benchmark, which consists of 1000 data science problems spanning 7 ...[371 chars]\n", + "| Response was: The paper introduces the DS-1000 benchmark, a reliable and challenging evaluation platform for data ...[778 chars]\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2308.16458 and provide a summary of it....[76 chars]\n", + "| I am asking research-assistant to take care of: Summarize the paper https://arxiv.org/abs/2308.16458...[52 chars]\n", "read_arxiv_paper(https://arxiv.org/abs/2308.16458)\n", - "| Response was: The paper introduces BioCoder, a benchmark for evaluating large language models (LLMs) in generating...[1825 chars]\n", + "| Response was: The paper introduces BioCoder, a benchmark for evaluating the performance of large language models i...[644 chars]\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2411.07781 and provide a summary of it....[76 chars]\n", + "| I am asking research-assistant to take care of: Summarize the paper https://arxiv.org/abs/2411.07781...[52 chars]\n", "read_arxiv_paper(https://arxiv.org/abs/2411.07781)\n", - "| Response was: The paper proposes a benchmark for evaluating the safety of code agents, highlighting the need for s...[152 chars]\n", + "| Response was: The paper proposes RedCode, a benchmark for evaluating the safety of code agents, and presents empir...[289 chars]\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2408.13204 and provide a summary of it....[76 chars]\n", + "| I am asking research-assistant to take care of: Summarize the paper https://arxiv.org/abs/2408.13204...[52 chars]\n", "read_arxiv_paper(https://arxiv.org/abs/2408.13204)\n", - "| Response was: The paper 'DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation' introduces a ...[356 chars]\n", + "| Response was: The paper introduces the DOMAINEVAL benchmark for evaluating LLMs' code generation capabilities acro...[388 chars]\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "| I am asking research-assistant to 
take care of: Read the paper https://arxiv.org/abs/2406.15877 and provide a summary of it....[76 chars]\n", + "| I am asking research-assistant to take care of: Summarize the paper https://arxiv.org/abs/2406.15877...[52 chars]\n", "read_arxiv_paper(https://arxiv.org/abs/2406.15877)\n", - "| Response was: The paper introduces BigCodeBench, a benchmark that challenges LLMs to invoke multiple function call...[458 chars]\n", + "| Response was: The paper introduces BigCodeBench, a new benchmark for evaluating LLMs' ability to solve challenging...[538 chars]\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "| I am asking reviewer to take care of: Review the manuscript draft: Introduction:\n", - "The recent papers https://arxiv.org/abs/2211.11501, https...[3533 chars]\n", - "review_text(\n", - "Introduction:\n", - "The recent papers https://arxiv.org/abs/2211.11501, https://arxiv.org/abs/2308.16458,...[3506 chars, 15 lines])\n", - "| Response was: The manuscript draft has been reviewed, and the feedback suggests several improvements to enhance it...[244 chars]\n", + "| I am asking reviewer to take care of: Review the manuscript: The recent papers introduce several new benchmarks for evaluating the perform...[777 chars]\n", + "review_text(The recent papers introduce several new benchmarks for evaluating the performance of large language ...[754 chars, 1 lines])\n", + "| Response was: The manuscript provides a good overview of recent benchmarks for evaluating large language models in...[231 chars]\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "| I am asking scientific-writer to take care of: Improve the manuscript draft: Introduction:\n", - "The recent papers https://arxiv.org/abs/2211.11501, http...[3809 chars]\n", - "improve_manuscript(\n", - "Introduction:\n", - "The recent papers https://arxiv.org/abs/2211.11501, https://arxiv.org/abs/2308.16458,...[3506 chars, 15 lines], \n", - "The manuscript draft has been reviewed, and the feedback suggests several improvements to enhance i)... [100 chars]\n", - "| Response was: \n", - "### Introduction\n", - "The recent papers [https://arxiv.org/abs/2211.11501](https://arxiv.org/abs/2211.11...[5696 chars]\n", + "| I am asking scientific-writer to take care of: Improve the manuscript: The recent papers introduce several new benchmarks for evaluating the perfor...[1033 chars]\n", + "improve_manuscript(The recent papers introduce several new benchmarks for evaluating the performance of large language ...[754 chars, 1 lines], The manuscript provides a good overview of recent benchmarks for evaluating large language models in)... 
[100 chars]\n", + "| Response was: # Evaluating Large Language Models in Code Generation: Recent Benchmarks and Future Directions\n", + "## Ab...[2595 chars]\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "| I am asking printer to take care of: Print the manuscript: \n", - "### Introduction\n", - "The recent papers [https://arxiv.org/abs/2211.11501](https:/...[5718 chars]\n" + "| I am asking printer to take care of: Print the manuscript: # Evaluating Large Language Models in Code Generation: Recent Benchmarks and F...[2617 chars]\n" ] }, { "data": { "text/markdown": [ "\n", - "### Introduction\n", - "The recent papers [https://arxiv.org/abs/2211.11501](https://arxiv.org/abs/2211.11501), [https://arxiv.org/abs/2308.16458](https://arxiv.org/abs/2308.16458), [https://arxiv.org/abs/2411.07781](https://arxiv.org/abs/2411.07781), [https://arxiv.org/abs/2408.13204](https://arxiv.org/abs/2408.13204), and [https://arxiv.org/abs/2406.15877](https://arxiv.org/abs/2406.15877) introduce new benchmarks and evaluate the performance of large language models (LLMs) on fine-grained tasks.\n", + "# Evaluating Large Language Models in Code Generation: Recent Benchmarks and Future Directions\n", + "## Abstract\n", + "The recent introduction of several new benchmarks has significantly advanced the evaluation of large language models (LLMs) in generating code. This manuscript provides an overview of these benchmarks, including [BigCodeBench](https://example.com/BigCodeBench), [DS-1000](https://example.com/DS-1000), [BioCoder](https://example.com/BioCoder), [RedCode](https://example.com/RedCode), and [DOMAINEVAL](https://example.com/DOMAINEVAL), and discusses their implications for the development of LLMs.\n", "\n", - "### Summary of Papers\n", - "The paper introduces the DS-1000 benchmark, which consists of 1000 data science problems spanning 7 Python libraries. The benchmark has three core features: realistic and practical use cases, reliable automatic evaluation, and defense against memorization. The current best public system, Codex-002, achieves 43.3% accuracy on the benchmark, leaving room for improvement.\n", + "## Introduction\n", + "The recent papers introduce several new benchmarks for evaluating the performance of large language models (LLMs) in generating code. These benchmarks include [BigCodeBench](https://example.com/BigCodeBench), [DS-1000](https://example.com/DS-1000), [BioCoder](https://example.com/BioCoder), [RedCode](https://example.com/RedCode), and [DOMAINEVAL](https://example.com/DOMAINEVAL). The results of these papers show that LLMs are not yet capable of following complex instructions to use function calls precisely and struggle with certain tasks such as cryptography and system coding. However, they also demonstrate the potential of LLMs in generating bioinformatics-specific code and highlight the importance of domain-specific knowledge.\n", "\n", - "The paper introduces [BioCoder](https://arxiv.org/abs/2308.16458), a benchmark for evaluating large language models (LLMs) in generating bioinformatics-specific code. The benchmark consists of 1,026 Python functions and 1,243 Java methods extracted from GitHub, as well as 253 examples from the Rosalind Project. The paper also presents the results of evaluating various models using BioCoder, including InCoder, CodeGen, and GPT-3.5. 
The results show that successful models accommodate long prompts with full context and contain domain-specific knowledge of bioinformatics. One of the major contributions of the paper is the development of the BioCoder benchmark, which provides a comprehensive evaluation of LLMs in generating bioinformatics code. The benchmark covers a wide range of bioinformatics calculations and includes a fuzz-testing framework for evaluation. The paper also demonstrates the effectiveness of fine-tuning a model (StarCoder) on the BioCoder dataset, resulting in a significant improvement in performance. However, the paper also highlights some limitations of the study. One of the limitations is that the BioCoder benchmark is limited to bioinformatics-specific code and may not be generalizable to other domains. Another limitation is that the evaluation of the models is based on a specific set of metrics (Pass@K), which may not capture all aspects of the models' performance. In conclusion, the paper provides a significant contribution to the field of bioinformatics and natural language processing. The development of the BioCoder benchmark and the evaluation of various models provide valuable insights into the capabilities and limitations of LLMs in generating bioinformatics code. However, further research is needed to address the limitations of the study and to explore the generalizability of the results to other domains.\n", + "## Discussion\n", + "Overall, these benchmarks provide a challenging and reliable evaluation platform for data science code generation models and emphasize the need for further research and development. The results of these benchmarks have significant implications for the development of LLMs, highlighting the need for improved performance in following complex instructions and generating code for specific domains. Furthermore, the benchmarks demonstrate the potential of LLMs in generating high-quality code for certain tasks, such as bioinformatics, and emphasize the importance of incorporating domain-specific knowledge into LLMs.\n", "\n", - "The paper proposes a benchmark for evaluating the safety of code agents, highlighting the need for stringent safety evaluations for diverse code agents.\n", - "\n", - "The paper '[DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation](https://arxiv.org/abs/2411.07781)' introduces a new benchmark for evaluating LLMs' coding capabilities in multiple domains. The results show that LLMs have strengths and weaknesses in different domains, and highlight the importance of generating more samples and developing more comprehensive benchmarks.\n", - "\n", - "The paper introduces [BigCodeBench](https://arxiv.org/abs/2408.13204), a benchmark that challenges LLMs to invoke multiple function calls as tools from various libraries and domains for fine-grained tasks. The evaluation of 60 LLMs shows that they are not yet capable of following complex instructions to use function calls precisely, with scores up to 60% compared to human performance of 97%. This highlights the limitations of current LLMs and the need for further advancements in this area.\n", - "\n", - "### Conclusion\n", - "The manuscripts discussed above provide a comprehensive overview of the current state of large language models in fine-grained tasks. By introducing new benchmarks such as DS-1000, BioCoder, DOMAINEVAL, and BigCodeBench, these papers highlight the strengths and limitations of current LLMs and provide valuable insights into their capabilities and limitations. 
The results of the evaluations demonstrate the need for further research to address the limitations of current LLMs and to develop more comprehensive benchmarks that can accurately assess their performance. Overall, the papers contribute significantly to the field of natural language processing and provide a foundation for future research in this area.\n", - "\n", - "### Future Work\n", - "To further improve the performance of LLMs on fine-grained tasks, future research should focus on developing more comprehensive benchmarks that can accurately assess their capabilities and limitations. Additionally, the development of new models that can accommodate long prompts with full context and contain domain-specific knowledge is crucial. The evaluation of models using a wide range of metrics is also essential to capture all aspects of their performance. By addressing the limitations of current LLMs and developing more comprehensive benchmarks, future research can lead to significant advancements in the field of natural language processing.\n", - "\n", - "### References\n", - "The recent papers [https://arxiv.org/abs/2211.11501](https://arxiv.org/abs/2211.11501), [https://arxiv.org/abs/2308.16458](https://arxiv.org/abs/2308.16458), [https://arxiv.org/abs/2411.07781](https://arxiv.org/abs/2411.07781), [https://arxiv.org/abs/2408.13204](https://arxiv.org/abs/2408.13204), and [https://arxiv.org/abs/2406.15877](https://arxiv.org/abs/2406.15877) provide a comprehensive overview of the current state of large language models in fine-grained tasks.\n" + "## Conclusion\n", + "In conclusion, the recent benchmarks for evaluating LLMs in code generation have provided significant insights into the capabilities and limitations of these models. The results of these benchmarks highlight the need for further research and development to improve the performance of LLMs in generating code, particularly in areas such as cryptography and system coding. However, they also demonstrate the potential of LLMs in generating high-quality code for specific domains, such as bioinformatics, and emphasize the importance of incorporating domain-specific knowledge into these models.\n" ], "text/plain": [ "" @@ -568,16 +556,16 @@ "source": [ "manuscript = scheduler.run(\"\"\"\n", "Please take care of ALL the following tasks:\n", - "* Read these papers and summarize them individually.\n", + "* Read these papers and summarize them\n", " * https://arxiv.org/abs/2211.11501\n", " * https://arxiv.org/abs/2308.16458\n", " * https://arxiv.org/abs/2411.07781\n", " * https://arxiv.org/abs/2408.13204\n", " * https://arxiv.org/abs/2406.15877\n", - "* write a manuscript text about the papers, \n", - "* review the manuscript to get constructive feedback\n", - "* improve the manuscript \n", - "* print the final manuscript\n", + "* Combine the information gained above and write a manuscript text about the papers, \n", + "* Afterwards, review the manuscript to get constructive feedback\n", + "* Use the feedback to improve the manuscript \n", + "* Print the final manuscript\n", "\"\"\")" ] },
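For readers skimming the diff: the functional change above is that a custom `system_message` is now appended to smolagents' built-in `CODE_SYSTEM_PROMPT` rather than replacing it, so each sub-agent keeps the default instructions a `CodeAgent` needs for emitting executable code. Below is a minimal sketch of how the agent-building helper might look once the hunks are applied. It is illustrative only: the function name `make_agent` is hypothetical, `model_id` is assumed to be the corresponding `LiteLLMModel` keyword, and only the imports and the `LiteLLMModel`/`CodeAgent` calls visible in the diff are taken as given.

```python
from smolagents.agents import CodeAgent
from smolagents.prompts import CODE_SYSTEM_PROMPT
from smolagents import LiteLLMModel


def make_agent(name, description, tools, model_id, system_message=None,
               api_base=None, api_key=None):
    """Hypothetical helper mirroring the diff: build a named CodeAgent whose
    system prompt always starts with the default CODE_SYSTEM_PROMPT."""
    model = LiteLLMModel(model_id=model_id,
                         api_base=api_base,
                         api_key=api_key)

    # Prepend rather than replace: the default prompt carries the coding and
    # tool-use instructions, the custom message only adds the agent's role.
    if system_message is None:
        system_message = CODE_SYSTEM_PROMPT
    else:
        system_message = CODE_SYSTEM_PROMPT + "\n" + system_message

    agent = CodeAgent(tools=tools, model=model, system_prompt=system_message)
    agent.name = name
    agent.description = description
    return agent
```

Keeping `CODE_SYSTEM_PROMPT` in front preserves the code-formatting and tool-use instructions that `CodeAgent` relies on; the appended text then only describes the agent's role, as the scheduler, reviewer, and scientific-writer agents in this notebook do.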