Commit 67b5f14

Merge branch 'main' into improving_data_preprocessing

lukasgarbas committed Nov 22, 2024
2 parents aad7d7f + 86676c5

Showing 16 changed files with 176 additions and 166 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8"]
+        python-version: ["3.9"]
 
     steps:
       - name: Checkout the repository
2 changes: 1 addition & 1 deletion README.md
@@ -19,7 +19,7 @@ Developed at <a href="https://www.informatik.hu-berlin.de/en/forschung-en/gebiet
 But which one of them is best for your NLP classification task?
 Since fine-tuning LMs is costly, it is not possible to try them all!
 
-**The solution**: *Tranferability estimation* with TransformerRanker!
+**The solution**: *Transferability estimation* with TransformerRanker!
 
 ---
 TransformerRanker is a library that
19 changes: 9 additions & 10 deletions examples/code_examples/chunking.py
@@ -2,19 +2,18 @@
 from transformer_ranker import TransformerRanker
 
 # Load the 'conll2003' dataset
-dataset = load_dataset('conll2003')
+dataset = load_dataset("conll2003")
 
-# Use smaller models to test on CPU
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+# Use smaller models to run on CPU
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker, set labels to chunk tags
-ranker = TransformerRanker(dataset=dataset,
-                           dataset_downsample=0.2,
-                           label_column='chunk_tags')
+ranker = TransformerRanker(dataset=dataset, dataset_downsample=0.2, label_column="chunk_tags")
 
 # ... and run it
 result = ranker.run(models=models, batch_size=64)
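Since 'conll2003' ships several tag columns, it can help to inspect them before picking label_column. A minimal sketch, with column names as listed on the Hugging Face dataset card:

from datasets import load_dataset

# Inspect which tag columns 'conll2003' provides before choosing label_column
dataset = load_dataset("conll2003")
print(dataset["train"].column_names)
# ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

Any of the *_tags columns could be passed as label_column to rank models for that tagging task.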
22 changes: 12 additions & 10 deletions examples/code_examples/classification.py
@@ -2,21 +2,23 @@
 from transformer_ranker import TransformerRanker
 
 # Load and inspect the 'trec' dataset
-dataset = load_dataset('trec')
+dataset = load_dataset("trec")
 print(dataset)
 
 # Use smaller models to run on CPU
-language_models = ['prajjwal1/bert-tiny',
-                   'google/electra-small-discriminator',
-                   'microsoft/deberta-v3-small',
-                   'bert-base-uncased',
-                   ]
+language_models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker
-ranker = TransformerRanker(dataset=dataset,
-                           dataset_downsample=0.2,
-                           label_column="coarse_label",
-                           )
+ranker = TransformerRanker(
+    dataset=dataset,
+    dataset_downsample=0.2,
+    label_column="coarse_label",
+)
 
 # ... and run it
 result = ranker.run(models=language_models, batch_size=32)
15 changes: 8 additions & 7 deletions examples/code_examples/entailment.py
@@ -2,16 +2,17 @@
 from transformer_ranker import TransformerRanker
 
 # Load 'rte' Recognizing Textual Entailment dataset
-entailment_dataset = load_dataset('glue', 'rte')
+entailment_dataset = load_dataset("glue", "rte")
 
 # Use smaller models to run on CPU
-language_models = ['prajjwal1/bert-tiny',
-                   'google/electra-small-discriminator',
-                   'microsoft/deberta-v3-small',
-                   'bert-base-uncased',
-                   ]
+language_models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
-# Initialize the ranker, set text_pair_column
+# Initialize the ranker, set column for text pairs
 ranker = TransformerRanker(dataset=entailment_dataset, text_pair_column="sentence2")
 
 # ... and run it
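For text-pair tasks like RTE, the data cleaner concatenates the two text columns into one before embedding (see _merge_textpairs in the datacleaner.py diff below, which joins them with a " [SEP] " token). A toy sketch of that merge, with invented sentences for illustration only:

# Toy illustration of how a text pair becomes a single input,
# mirroring DatasetCleaner._merge_textpairs (a sketch, not the library API)
def merge_texts(entry: dict[str, str]) -> dict[str, str]:
    entry["sentence1"] = entry["sentence1"] + " [SEP] " + entry["sentence2"]
    return entry

pair = {"sentence1": "A man is playing a guitar.", "sentence2": "Someone is making music."}
print(merge_texts(pair)["sentence1"])
# A man is playing a guitar. [SEP] Someone is making music.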
6 changes: 3 additions & 3 deletions examples/code_examples/multiple_runs.py
@@ -2,18 +2,18 @@
 from transformer_ranker import TransformerRanker
 
 # Load a dataset, initialize the ranker
-dataset = load_dataset('trec')
+dataset = load_dataset("trec")
 ranker = TransformerRanker(dataset=dataset, dataset_downsample=0.2)
 
 # Load smaller models
-models = ['prajjwal1/bert-tiny', 'google/electra-small-discriminator']
+models = ["prajjwal1/bert-tiny", "google/electra-small-discriminator"]
 
 # ... and rank them using a large batch size
 result = ranker.run(models=models, batch_size=124)
 print(result)
 
 # Add larger models
-models = ['bert-large-cased', 'google/electra-large-discriminator']
+models = ["bert-large-cased", "google/electra-large-discriminator"]
 
 # ... and rank them using a small batch size
 result.append(ranker.run(models=models, batch_size=16))
15 changes: 8 additions & 7 deletions examples/code_examples/regression.py
@@ -2,14 +2,15 @@
 from transformer_ranker import TransformerRanker
 
 # Load a regression dataset
-regression_dataset = load_dataset('glue', 'stsb')
+regression_dataset = load_dataset("glue", "stsb")
 
-# You can test on cpu using smaller models
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+# Use smaller models to run on CPU
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker, set the text pair column
 ranker = TransformerRanker(dataset=regression_dataset, text_pair_column="sentence2")
17 changes: 8 additions & 9 deletions examples/code_examples/tagging.py
@@ -2,19 +2,18 @@
 from transformer_ranker import TransformerRanker
 
 # Load the WNUT-17 NER dataset of English tweets
-dataset_ner = load_dataset('leondz/wnut_17')
+dataset_ner = load_dataset("leondz/wnut_17")
 
 # Use smaller models to test on CPU
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker, set labels to ner tags
-ranker = TransformerRanker(dataset=dataset_ner,
-                           dataset_downsample=0.2,
-                           label_column='ner_tags')
+ranker = TransformerRanker(dataset=dataset_ner, dataset_downsample=0.2, label_column="ner_tags")
 
 # ... and run it
 result = ranker.run(models=models, batch_size=64)
2 changes: 1 addition & 1 deletion setup.py
@@ -18,5 +18,5 @@ def read_requirements():
     url="https://github.com/flairNLP/transformer-ranker",
     install_requires=read_requirements(),
     license='MIT',
-    python_requires=">=3.8",
+    python_requires=">=3.9",
 )
45 changes: 22 additions & 23 deletions transformer_ranker/datacleaner.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Optional, Type, Union
 
 import datasets
 import torch
@@ -22,7 +22,7 @@ def __init__(
         task_type: Optional[str] = None,
         text_column: Optional[str] = None,
         label_column: Optional[str] = None,
-        label_map: Optional[Dict[str, int]] = None,
+        label_map: Optional[dict[str, int]] = None,
         text_pair_column: Optional[str] = None,
     ):
         """
@@ -34,7 +34,7 @@ def __init__(
         :param change_bio_encoding: Convert BIO to single-class labels, removing B-, I-, O- prefix.
         :param remove_empty_sentences: Whether to remove empty sentences.
         :param dataset_downsample: Fraction to reduce the dataset size.
-        :param task_type: Task category "token classification", "text classification", "text regression".
+        :param task_type: "token classification", "text classification", or "text regression".
         :param text_column: Column name for texts.
         :param label_column: Column name for labels.
        :param label_map: A dictionary which maps label names to integers.
@@ -107,7 +107,7 @@ def prepare_dataset(
             )
 
         # Convert string labels to integers
-        if label_type is str:
+        if isinstance(label_type, str):
             dataset, self.label_map = self._make_labels_categorical(dataset, label_column)
 
         # Try to find label map in the dataset
@@ -120,7 +120,10 @@
                 dataset, label_column, self.label_map
             )
 
-        logger.info("Label map: %s", self.label_map)
+        # Keep only text and label columns
+        keep_columns = {text_column, self.text_pair_column, label_column} - {None}
+        columns_to_remove = list(set(dataset.column_names) - keep_columns)
+        dataset = dataset.remove_columns(columns_to_remove)
 
         # Set updated attributes and log them
         self.text_column = text_column
@@ -129,11 +132,6 @@
         self.dataset_size = len(dataset)
         self.log_dataset_info()
 
-        # Keep only text and label columns
-        keep_columns = {self.text_column, self.text_pair_column, self.label_column} - {None}
-        columns_to_remove = list(set(dataset.column_names) - keep_columns)
-        dataset = dataset.remove_columns(columns_to_remove)
-
         return dataset
@@ -147,7 +145,7 @@ def prepare_labels(self, dataset: Dataset) -> torch.Tensor:
         )
         return torch.tensor(labels)
 
-    def prepare_sentences(self, dataset: Dataset) -> List[str]:
+    def prepare_sentences(self, dataset: Dataset) -> list[str]:
         """Gather sentences in the text column."""
         return dataset[self.text_column]
@@ -160,7 +158,7 @@ def _downsample(dataset: Dataset, ratio: float) -> Dataset:
     @staticmethod
     def _find_text_and_label_columns(
         dataset: Dataset, text_column: Optional[str] = None, label_column: Optional[str] = None
-    ) -> Tuple[str, str, Type]:
+    ) -> tuple[str, str, Type]:
         """Find text and label columns in hf datasets based on common keywords"""
         text_columns = [
             "text", "sentence", "token", "tweet", "document", "paragraph", "description",
@@ -196,7 +194,7 @@ def _find_text_and_label_columns(
     @staticmethod
     def _merge_textpairs(
         dataset: Dataset, text_column: str, text_pair_column: str
-    ) -> Tuple[Dataset, str]:
+    ) -> tuple[Dataset, str]:
         """Concatenate text pairs into a single text using separator token"""
         new_text_column_name = text_column + "+" + text_pair_column
@@ -206,7 +204,7 @@
                 f"Use one of the following names for tex pair: {dataset.column_names}."
             )
 
-        def merge_texts(dataset_entry: Dict[str, str]) -> Dict[str, str]:
+        def merge_texts(dataset_entry: dict[str, str]) -> dict[str, str]:
             dataset_entry[text_column] = (
                 dataset_entry[text_column] + " [SEP] " + dataset_entry[text_pair_column]
             )
@@ -244,7 +242,7 @@ def pre_tokenize(example):
             example[text_column] = [token for token, _ in encoding]
             return example
 
-        dataset = dataset.map(pre_tokenize, num_proc=None, desc="Pre-tokenizing texts with Whitespace")
+        dataset = dataset.map(pre_tokenize, num_proc=None, desc="Whitespace pre-tokenization")
         return dataset
 
     @staticmethod
@@ -287,7 +285,7 @@ def is_valid_entry(sample) -> bool:
     @staticmethod
     def _make_labels_categorical(
         dataset: Dataset, label_column: str
-    ) -> Tuple[Dataset, Dict[str, int]]:
+    ) -> tuple[Dataset, dict[str, int]]:
         """Convert string labels to integers"""
         unique_labels = sorted(set(dataset[label_column]))
         label_map = {label: idx for idx, label in enumerate(unique_labels)}
@@ -300,7 +298,7 @@ def map_labels(dataset_entry):
         return dataset, label_map
 
     @staticmethod
-    def _create_label_map(dataset: Dataset, label_column: str) -> Dict[str, int]:
+    def _create_label_map(dataset: Dataset, label_column: str) -> dict[str, int]:
         """Try to find feature names in a hf dataset."""
         label_names = getattr(
             getattr(dataset.features[label_column], "feature", None), "names", None
@@ -320,8 +318,8 @@ def _create_label_map(dataset: Dataset, label_column: str) -> Dict[str, int]:
 
     @staticmethod
     def _change_bio_encoding(
-        dataset: Dataset, label_column: str, label_map: Dict[str, int]
-    ) -> Tuple[Dataset, Dict[str, int]]:
+        dataset: Dataset, label_column: str, label_map: dict[str, int]
+    ) -> tuple[Dataset, dict[str, int]]:
         """Remove BIO prefixes from NER labels, update the dataset, and create a new label map."""
 
         # Get unique labels without BIO prefixes and create new label map
@@ -343,15 +341,16 @@
         if label_map == new_label_map:
             logger.warning(
                 "Could not remove BIO prefixes for this tagging dataset. "
-                "Please add the label map as parameter label_map: Dict[str, int] = ... manually."
+                "Please add the label map as parameter label_map: dict[str, int] = ... manually."
             )
 
         return dataset, new_label_map
 
     def log_dataset_info(self) -> None:
         """Log information about dataset"""
-        logger.info("Texts and labels: '%s', '%s'", self.text_column, self.label_column)
-        logger.info("Task category: '%s'", self.task_type)
+        logger.info(f"Texts and labels: {self.text_column}, {self.label_column}")
+        logger.info(f"Label map: {self.label_map}")
         is_downsampled = self.dataset_downsample and self.dataset_downsample < 1.0
         downsample_info = f"(down-sampled to {self.dataset_downsample})" if is_downsampled else ""
-        logger.info("Dataset size: %s texts %s", self.dataset_size, downsample_info)
+        logger.info(f"Dataset size: {self.dataset_size} texts {downsample_info}")
+        logger.info(f"Task category: {self.task_type}")