Skip to content

Commit

Permalink
Changed downsampling ratios in test-cases to speed up testing workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
plonerma committed Oct 30, 2024
1 parent d06ce1e commit bcc4d6f
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 24 deletions.
20 changes: 10 additions & 10 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,23 @@

# Datasets exercised by the data-cleaning tests.
# Each entry is (task_type, dataset_name, downsampling_ratio); the per-dataset
# ratio keeps the downsampled split small so the test workflow runs quickly.
# NOTE: every entry must have exactly 3 fields to match the 3-name
# @pytest.mark.parametrize("task_type,dataset_name,downsampling_ratio", ...)
# unpacking — the pre-change 2-tuples are removed.
test_datasets = [
    # Token datasets
    ("token classification", "conll2003", 0.01),
    ("token classification", "wnut_17", 0.05),

    # Text classification
    ("text classification", "trec", 0.05),
    ("text classification", "stanfordnlp/sst2", 0.005),
    ("text classification", "hate_speech18", 0.025),

    # Text-pair classification
    ("text classification", "yangwang825/sick", 0.025),
    ("text classification", "SetFit/rte", 0.05),
]

@pytest.mark.parametrize("task_type,dataset_name", test_datasets)
def test_datacleaner(task_type, dataset_name):
@pytest.mark.parametrize("task_type,dataset_name,downsampling_ratio", test_datasets)
def test_datacleaner(task_type, dataset_name, downsampling_ratio):

preprocessor = DatasetCleaner(dataset_downsample=0.2)
preprocessor = DatasetCleaner(dataset_downsample=downsampling_ratio)
dataset = preprocessor.prepare_dataset(dataset_name)

# Test dataset preprocessing
Expand Down
16 changes: 4 additions & 12 deletions tests/test_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,15 @@


def test_embedder_inputs(small_language_models):
    """Embed every test sentence with each small model and assert that no
    embedding comes back empty or ``None``.

    The merged diff left a dead accumulation dict and a second loop that
    re-asserted the same condition over the collected embeddings; the
    assertion now runs inline so a failure points at the offending model
    immediately and no per-model bookkeeping is needed.
    """
    for model in small_language_models:
        # layer_ids="all" exercises the word-level embedder across all layers.
        embedder = Embedder(model=model, layer_ids="all")

        for sentence in test_sentences:
            embedding = embedder.embed(sentence)
            # Fail fast per sentence instead of collecting everything first.
            assert embedding is not None and embedding != [], f"Empty or None embedding found for model {embedder.model_name}"


def test_embedder_outputs(small_language_models):
def test_embedder_word_level(small_language_models):
for model in small_language_models:
embedder = Embedder(model=model, layer_ids="all") # test word-level embedder
model_name = embedder.model_name
Expand All @@ -40,6 +30,8 @@ def test_embedder_outputs(small_language_models):
assert embedding.shape[:2] == (5, num_layers), \
f"Expected first two dimensions to be (5, {num_layers}), got {embedding.shape[:2]} using model {model_name}"


def test_embedder_sentence_pooling(small_language_models):
for model in small_language_models:
embedder = Embedder(model=model, layer_ids="all", sentence_pooling="mean") # test sentence-level embedder
model_name = embedder.model_name
Expand Down
4 changes: 2 additions & 2 deletions tests/test_ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@


def test_ranker_trec(small_language_models, trec):
    """Run the ranker end-to-end on TREC.

    A small downsampling ratio (0.05) keeps the test fast; the stale
    leftover line constructing a second ranker with the old 0.2 ratio
    (dead — immediately overwritten, but it still paid the construction
    cost) is removed.
    """
    ranker = TransformerRanker(dataset=trec, dataset_downsample=0.05)
    ranker.run(models=small_language_models, batch_size=64)


def test_ranker_conll(small_language_models, conll):
    """Run the ranker end-to-end on CoNLL.

    Uses a very small downsampling ratio (0.01) so the token-level run
    stays fast; the stale leftover construction with the old 0.2 ratio
    (dead — immediately overwritten) is removed.
    """
    ranker = TransformerRanker(dataset=conll, dataset_downsample=0.01)
    ranker.run(models=small_language_models, batch_size=64)

0 comments on commit bcc4d6f

Please sign in to comment.