Merge pull request #362 from EleutherAI/cleanup-for-release
Cleanup `README.md` and package deps
StellaAthena authored Dec 7, 2022
2 parents fdd3dbc + 1e5d55d commit 1d8107b
Showing 26 changed files with 434 additions and 440 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/python-app.yml
@@ -32,7 +32,9 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest pytest-cov
pip install -e .[dev]
pip install -e .[dev,multilingual]
# Install optional git dependencies
pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
453 changes: 64 additions & 389 deletions README.md

Large diffs are not rendered by default.

268 changes: 268 additions & 0 deletions docs/task_table.md

Large diffs are not rendered by default.

26 changes: 24 additions & 2 deletions lm_eval/tasks/translation.py
@@ -16,6 +16,20 @@
from lm_eval.base import Task, rf
from typing import List

try:
import nagisa

HAS_NAGISA = True
except ImportError:
HAS_NAGISA = False

try:
import jieba

HAS_JIEBA = True
except ImportError:
HAS_JIEBA = False


_CITATION = """
@inproceedings{post-2018-call,
@@ -63,14 +77,22 @@ def version_of(dataset, language_pair):

def zh_split(zh_text: List[str]) -> List[str]:
"""Chinese splitting"""
import jieba
if not HAS_JIEBA:
raise ImportError(
"Chinese text splitting requires the `jieba` package. "
"Please install it with:\npip install jieba"
)

return [" ".join(jieba.cut(txt.strip())) for txt in zh_text]


def ja_split(ja_text: List[str]) -> List[str]:
"""Japanese splitting"""
import nagisa
if not HAS_NAGISA:
raise ImportError(
"Japanese text splitting requires the `nagisa` package. "
"Please install it with:\npip install nagisa"
)

return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text]

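
The guarded imports above defer the hard failure from import time to call time: the module still imports cleanly without jieba or nagisa, and only raises (with an install hint) when Chinese or Japanese splitting is actually requested. A minimal, self-contained sketch of that pattern, assuming a hypothetical `require` helper and `zh_tokenize` function that are not part of the harness:

import importlib
from typing import List


def require(package: str, install_hint: str):
    """Import `package`, or raise ImportError with an install hint."""
    try:
        return importlib.import_module(package)
    except ImportError as exc:
        raise ImportError(
            f"This feature requires the `{package}` package. "
            f"Please install it with:\n{install_hint}"
        ) from exc


def zh_tokenize(texts: List[str]) -> List[str]:
    """Hypothetical helper: segment Chinese strings, importing jieba lazily."""
    jieba = require("jieba", "pip install jieba")
    return [" ".join(jieba.cut(t.strip())) for t in texts]
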
14 changes: 14 additions & 0 deletions lm_eval/tasks/truthfulqa.py
@@ -27,6 +27,14 @@
from lm_eval.metrics import mean


try:
import bleurt

HAS_BLEURT = True
except ImportError:
HAS_BLEURT = False


_CITATION = """
@misc{lin2021truthfulqa,
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
@@ -164,6 +172,12 @@ class TruthfulQAGeneration(Task):

def __init__(self):
super().__init__()
if not HAS_BLEURT:
raise ImportError(
"`TruthfulQAGeneration` requires the `bleurt` package. Please install it with:\n"
"pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
"\nWARNING: Installing any other version of bleurt may result in different results."
)
self.bleurt = datasets.load_metric("bleurt")

def has_training_docs(self):
3 changes: 2 additions & 1 deletion lm_eval/utils.py
@@ -5,7 +5,6 @@
import functools
import inspect
import sys
import pytest
from typing import List


@@ -187,6 +186,8 @@ def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest

package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
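
Here `import pytest` moves from the top of lm_eval/utils.py into run_task_tests, so pytest (listed only under the `dev` extra in setup.py below) is not needed just to import the module; it is only required when task integrity tests actually run. A small sketch of that deferred-import idea, with an illustrative function name that is not the harness API:

from typing import List


def run_selected_tests(task_list: List[str]) -> int:
    # Imported lazily so the surrounding module does not require pytest.
    import pytest

    # "-k" restricts collection to tests whose names match any listed task.
    return pytest.main(["--verbose", "-k", " or ".join(task_list), "tests/"])
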
61 changes: 40 additions & 21 deletions scripts/make_table_tasks.py
@@ -1,33 +1,52 @@
"""
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]

values = []
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def chk(tf):
def check(tf):
if tf:
return "✓"
else:
return " "


for tname, Task in tasks.TASK_REGISTRY.items():
task = Task()

v = [
tname,
chk(task.has_training_docs()),
chk(task.has_validation_docs()),
chk(task.has_test_docs()),
len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())),
", ".join(task.aggregation().keys()),
]
print(v)
values.append(v)

writer.value_matrix = values

print(writer.dumps())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--output", type=str, default="task_table.md")
args = parser.parse_args()

writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]
values = []

tasks = tasks.TASK_REGISTRY.items()
tasks = sorted(tasks, key=lambda x: x[0])
for tname, Task in tasks:
task = Task()
v = [
tname,
check(task.has_training_docs()),
check(task.has_validation_docs()),
check(task.has_test_docs()),
len(
list(
task.test_docs() if task.has_test_docs() else task.validation_docs()
)
),
", ".join(task.aggregation().keys()),
]
logger.info(v)
values.append(v)
writer.value_matrix = values
table = writer.dumps()
with open(args.output, "w") as f:
f.write(table)
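
The script is now wrapped in an argparse CLI with a __main__ guard, sorts the task registry, and writes the Markdown table to --output; docs/task_table.md above is presumably generated this way, per the usage note `python make_table_tasks.py --output <markdown_filename>`. A minimal sketch of the pytablewriter usage it builds on; the two rows below are illustrative placeholders, not real registry output:

from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]
writer.value_matrix = [
    ["example_task_a", "✓", "✓", "✓", 1000, "acc"],
    ["example_task_b", " ", "✓", " ", 250, "acc, acc_norm"],
]

with open("task_table.md", "w") as f:
    f.write(writer.dumps())  # dumps() renders the whole table as Markdown
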
37 changes: 16 additions & 21 deletions setup.py
@@ -14,36 +14,31 @@
url="https://github.com/EleutherAI/lm-evaluation-harness",
packages=setuptools.find_packages(),
classifiers=[
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires=">=3.6",
install_requires=[
"datasets>=2.0.0",
"click>=7.1",
"jsonlines",
"numexpr",
"openai>=0.6.4",
"pybind11>=2.6.2",
"pycountry",
"pytablewriter",
"rouge-score>=0.0.4",
"sacrebleu==1.5.0",
"scikit-learn>=0.24.1",
"sqlitedict",
"torch>=1.7",
"tqdm-multiprocess",
"transformers>=4.1",
"sqlitedict==1.6.0",
"pytablewriter==0.58.0",
"sacrebleu==1.5.0",
"rouge-score==0.0.4",
"pycountry==20.7.3",
"numexpr>=2.7.2",
"lm_dataformat==0.0.20",
"pybind11==2.6.2",
"tqdm-multiprocess==0.0.11",
"zstandard==0.15.2",
"jsonlines==2.0.0",
"mock==4.0.3",
"openai==0.6.4",
"jieba==0.42.1",
"nagisa==0.2.7",
"bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
],
dependency_links=[
"https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
"zstandard",
],
extras_require={"dev": ["pytest", "black", "pre-commit"]},
extras_require={
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
},
)
3 changes: 2 additions & 1 deletion tests/test_models.py
@@ -258,8 +258,9 @@ def textsynth_mock_completion(**kwargs):
import requests

os.makedirs("tests/testdata", exist_ok=True)
hash_kwargs = {k: v for k, v in kwargs.items() if k != "headers"}
hash = hashlib.sha256(
json.dumps(kwargs, sort_keys=True).encode("utf-8")
json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"tests/testdata/textsynth_test_{hash}.pkl"

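
The mock's cache key now hashes every request kwarg except `headers`, so changing the API key no longer changes the recorded-fixture filename. A standalone sketch of that cache-key construction; the kwargs shown are made-up placeholders:

import hashlib
import json

kwargs = {
    "url": "https://example.invalid/v1/completions",  # placeholder endpoint
    "headers": {"Authorization": "Bearer <secret>"},   # excluded from the hash
    "json": {"prompt": "Hello", "max_tokens": 16},
}

hash_kwargs = {k: v for k, v in kwargs.items() if k != "headers"}
cache_key = hashlib.sha256(
    json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
print(f"tests/testdata/textsynth_test_{cache_key}.pkl")
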
3 changes: 0 additions & 3 deletions tests/test_tasks.py
@@ -7,10 +7,7 @@
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, task_class):
print("Evaluating task", taskname)
# dl = task_class.download
# task_class.download = MagicMock()
task = task_class()
# task_class.download = dl

assert task.has_training_docs() in [True, False]
assert task.has_validation_docs() in [True, False]
2 changes: 1 addition & 1 deletion tests/test_version_stable.py
@@ -51,7 +51,7 @@ def flatten(d, parent_key="", sep="."):
items = []
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.MutableMapping):
if isinstance(v, collections.abc.MutableMapping):
items.extend(flatten(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
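
collections.MutableMapping was a deprecated alias that was removed in Python 3.10; the abstract base class lives in collections.abc, which works on all supported versions. A self-contained sketch of the corrected flatten helper, filling in the return statement that falls outside the hunk:

import collections.abc


def flatten(d, parent_key="", sep="."):
    """Flatten nested mappings into dotted keys, e.g. {"a": {"b": 1}} -> {"a.b": 1}."""
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


assert flatten({"a": {"b": 1}, "c": 2}) == {"a.b": 1, "c": 2}
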
15 binary files not shown.
