diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..559b00b28
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,60 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+# See https://github.com/crmne/cookiecutter-modern-datascience
+fail_fast: true
+exclude: '^$'
+files: ^bigbio/biodatasets/
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-case-conflict
+      - id: debug-statements
+      - id: detect-private-key
+      - id: check-merge-conflict
+      - id: check-added-large-files
+  # - repo: https://github.com/myint/autoflake
+  #   rev: v1.7.6
+  #   hooks:
+  #     - id: autoflake
+  #       args:
+  #         - --in-place
+  #         - --remove-duplicate-keys
+  #         - --remove-unused-variables
+  #         - --remove-all-unused-imports
+  #         - --expand-star-imports
+  - repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        args:
+          - --max-line-length
+          - '119'
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.10.1
+    hooks:
+      - id: isort
+        args:
+          - --profile
+          - black
+  - repo: https://github.com/ambv/black
+    rev: 22.10.0
+    hooks:
+      - id: black
+        args:
+          - --line-length
+          - '119'
+          - --target-version
+          - py38
+  - repo: local
+    hooks:
+      - id: test-bigbio
+        name: running bigbio unit tests
+        entry: python -m tests.test_bigbio
+        language: system
+        files: ^bigbio/biodatasets/
+        pass_filenames: true
+        # always_run: true
\ No newline at end of file
diff --git a/bigbio/biodatasets/an_em/an_em.py b/bigbio/biodatasets/an_em/an_em.py
index ab479b664..a52fd8fa8 100644
--- a/bigbio/biodatasets/an_em/an_em.py
+++ b/bigbio/biodatasets/an_em/an_em.py
@@ -54,11 +54,11 @@ _DISPLAYNAME = "AnEM"
 
 
 _DESCRIPTION = """\
-AnEM corpus is a domain- and species-independent resource manually annotated for anatomical
-entity mentions using a fine-grained classification system. The corpus consists of 500 documents
-(over 90,000 words) selected randomly from citation abstracts and full-text papers with
-the aim of making the corpus representative of the entire available biomedical scientific
-literature. The corpus annotation covers mentions of both healthy and pathological anatomical
+AnEM corpus is a domain- and species-independent resource manually annotated for anatomical \
+entity mentions using a fine-grained classification system. The corpus consists of 500 documents \
+(over 90,000 words) selected randomly from citation abstracts and full-text papers with \
+the aim of making the corpus representative of the entire available biomedical scientific \
+literature. The corpus annotation covers mentions of both healthy and pathological anatomical \
 entities and contains over 3,000 annotated mentions.
 """
 
@@ -167,10 +167,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
                     "filepath": all_data,
-                    "split_path": data_dir
-                    / "AnEM-1.0.4"
-                    / "development"
-                    / "train-files.list",
+                    "split_path": data_dir / "AnEM-1.0.4" / "development" / "train-files.list",
                     "split": "train",
                 },
             ),
@@ -186,10 +183,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
                     "filepath": all_data,
-                    "split_path": data_dir
-                    / "AnEM-1.0.4"
-                    / "development"
-                    / "test-files.list",
+                    "split_path": data_dir / "AnEM-1.0.4" / "development" / "test-files.list",
                     "split": "dev",
                 },
             ),
@@ -251,10 +245,7 @@ def _brat_to_source(self, filepath, brat_example):
             "equivalences": [
                 {
                     "entity_id": brat_entity["id"],
-                    "ref_ids": [
-                        f"{brat_example['document_id']}_{ids}"
-                        for ids in brat_entity["ref_ids"]
-                    ],
+                    "ref_ids": [f"{brat_example['document_id']}_{ids}" for ids in brat_entity["ref_ids"]],
                 }
                 for brat_entity in brat_example["equivalences"]
             ],
diff --git a/streamlit_demo/vis_data_card.py b/streamlit_demo/vis_data_card.py
index 0d89c7243..a6b64a910 100644
--- a/streamlit_demo/vis_data_card.py
+++ b/streamlit_demo/vis_data_card.py
@@ -235,7 +235,7 @@ def gen_latex(dataset_name, helper, splits, schemas, fig_path):
         r"Token frequency distribution by split (top) and frequency of different kind of instances (bottom).}"
         + "\n"
     )
-    latex_bod += r"\end{figure}" + "\n" + r"\textbf{Dataset Description} "
+    latex_bod += r"\end{figure}" + "\n" + r"\textbf{Dataset Description:} "
     latex_bod += (
         fr"{descriptions}"
         + "\n"
@@ -403,4 +403,3 @@ def draw_figure(data_name, data_config_name, schema_type):
         latex_name = f"{data_name}_{config_name}.tex"
         write_latex(latex_bod, latex_name)
         print(latex_bod)
-