From 318f0c75b6230a169398a38353c7cffcf16eb318 Mon Sep 17 00:00:00 2001 From: Gully Burns <45613102+GullyBurns@users.noreply.github.com> Date: Thu, 23 Nov 2023 04:35:06 -0800 Subject: [PATCH] Closes #900 (version 2) (#904) Merged and updated in hub --- bigbio/hub/hub_repos/czi_drsm/README.md | 61 +++++++++++++---------- bigbio/hub/hub_repos/czi_drsm/czi_drsm.py | 18 +++---- 2 files changed, 44 insertions(+), 35 deletions(-) diff --git a/bigbio/hub/hub_repos/czi_drsm/README.md b/bigbio/hub/hub_repos/czi_drsm/README.md index 2d950e6e..01266588 100644 --- a/bigbio/hub/hub_repos/czi_drsm/README.md +++ b/bigbio/hub/hub_repos/czi_drsm/README.md @@ -4,7 +4,7 @@ language: bigbio_language: - English license: cc0-1.0 -bigbio_license_shortname: cc0-1.0 +bigbio_license_shortname: CC0_1p0 multilinguality: monolingual pretty_name: CZI DRSM homepage: https://github.com/chanzuckerberg/DRSM-corpus @@ -26,40 +26,49 @@ bigbio_tasks: Research Article document classification dataset based on aspects of disease research. Currently, the dataset consists of three subsets: (A) classifies title/abstracts of papers into most popular subtypes of clinical, basic, and translational papers (~20k papers); - - Clinical Characteristics, Disease Pathology, and Diagnosis - - Text that describes (A) symptoms, signs, or ‘phenotype’ of a disease; - (B) the effects of the disease on patient organs, tissues, or cells; - (C) the results of clinical tests that reveal pathology (including - biomarkers); (D) research that use this information to figure out + + - Clinical Characteristics, Disease Pathology, and Diagnosis: + Text that describes (i) symptoms, signs, or ‘phenotype’ of a disease; + (ii) the effects of the disease on patient organs, tissues, or cells; + (iii) the results of clinical tests that reveal pathology (including + biomarkers); (iv) research that uses this information to figure out a diagnosis. 
- - Therapeutics in the clinic - + + - Therapeutics in the clinic: Text describing how treatments work in the clinic (but not in a clinical trial). - - Disease mechanism - - Text that describes either (A) mechanistic involvement of specific genes in disease - (deletions, gain of function, etc); (B) how molecular signalling or metabolism - binding, activating, phosphorylation, concentration increase, etc.) - are involved in the mechanism of a disease; or (C) the physiological - mechanism of disease at the level of tissues, organs, and body systems. - - Patient-Based Therapeutics - - Text describing (A) Clinical trials (studies of therapeutic measures being - used on patients in a clinical trial); (B) Post Marketing Drug Surveillance + + - Disease mechanism: + + - Patient-Based Therapeutics: + Text describing (i) Clinical trials (studies of therapeutic measures being + used on patients in a clinical trial); (ii) Post Marketing Drug Surveillance (effects of a drug after approval in the general population or as part of - ‘standard healthcare’); (C) Drug repurposing (how a drug that has been + ‘standard healthcare’); (iii) Drug repurposing (how a drug that has been approved for one use is being applied to a new disease). 
(B) identifies whether a title/abstract of a paper describes substantive research into Quality of Life (~10k papers); - - -1 - the paper is not a primary experimental study in rare disease - - 0 - the study does not directly investigate quality of life - - 1 - the study investigates qol but not as its primary contribution - - 2 - the study's primary contribution centers on quality of life measures + + - [-1] - the paper is not a primary experimental study in rare disease + + - [0] - the study does not directly investigate quality of life + + - [1] - the study investigates quality of life (QoL) but not as its primary contribution + + - [2] - the study's primary contribution centers on quality of life measures (C) identifies if a paper is a natural history study (~10k papers). - - -1 - the paper is not a primary experimental study in rare disease - - 0 - the study is not directly investigating the natural history of a disease - - 1 - the study includes some elements a natural history but not as its primary contribution - - 2 - the study's primary contribution centers on observing the time course of a rare disease + + - [-1] - the paper is not a primary experimental study in rare disease + + - [0] - the study is not directly investigating the natural history of a disease + + - [1] - the study includes some elements of a natural history but not as its primary contribution + + - [2] - the study's primary contribution centers on observing the time course of a rare disease -These classifications are particularly relevant in rare disease research, a field that is generally understudied. +These classifications are particularly relevant in rare disease research, a field that is generally understudied. + +This data was compiled through the use of a gamified curation approach based on CentaurLabs' 'diagnos.us' platform. 
## Citation Information diff --git a/bigbio/hub/hub_repos/czi_drsm/czi_drsm.py b/bigbio/hub/hub_repos/czi_drsm/czi_drsm.py index 68e3ed3b..202907bd 100644 --- a/bigbio/hub/hub_repos/czi_drsm/czi_drsm.py +++ b/bigbio/hub/hub_repos/czi_drsm/czi_drsm.py @@ -60,12 +60,9 @@ import pandas as pd from pathlib import Path -from .bigbiohub import text_features, BigBioConfig, Tasks - -#from .bigbiohub import BigBioConfig -#from .bigbiohub import Tasks - -#from .bigbiohub import +from .bigbiohub import text_features +from .bigbiohub import BigBioConfig +from .bigbiohub import Tasks _LOCAL = False @@ -89,6 +86,7 @@ Research Article document classification dataset based on aspects of disease research. Currently, the dataset consists of three subsets: (A) classifies title/abstracts of papers into most popular subtypes of clinical, basic, and translational papers (~20k papers); + - Clinical Characteristics, Disease Pathology, and Diagnosis - Text that describes (A) symptoms, signs, or ‘phenotype’ of a disease; (B) the effects of the disease on patient organs, tissues, or cells; @@ -111,13 +109,15 @@ approved for one use is being applied to a new disease). (B) identifies whether a title/abstract of a paper describes substantive research into Quality of Life (~10k papers); + - -1 - the paper is not a primary experimental study in rare disease - 0 - the study does not directly investigate quality of life - 1 - the study investigates qol but not as its primary contribution - 2 - the study's primary contribution centers on quality of life measures (C) identifies if a paper is a natural history study (~10k papers). 
-` - -1 - the paper is not a primary experimental study in rare disease +` + - -1 - the paper is not a primary experimental study in rare disease - 0 - the study is not directly investigating the natural history of a disease - 1 - the study includes some elements a natural history but not as its primary contribution - 2 - the study's primary contribution centers on observing the time course of a rare disease @@ -126,7 +126,7 @@ """ _HOMEPAGE = "https://github.com/chanzuckerberg/DRSM-corpus/" -_LICENSE = "CC0_1p0" +_LICENSE = 'CC0_1p0' _LANGUAGES = ['English'] _PUBMED = False @@ -143,7 +143,7 @@ 'nhs': "https://raw.githubusercontent.com/chanzuckerberg/DRSM-corpus/main/v2/nhs_all_2023_03_31.tsv" } -_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] +_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] _SOURCE_VERSION = "1.0.0" _BIGBIO_VERSION = "1.0.0"