Skip to content

Benchmarking

Ruan Chaves edited this page Jun 3, 2023 · 4 revisions

The script on this page demonstrates how to evaluate a distilgpt2 model on 1000 hashtags. It can be easily modified to evaluate other models.

We collect 1,000 hashtags in total by taking the first 100 hashtags from each of 10 word segmentation datasets.

For more information on the datasets, read the article 15 Datasets for Word Segmentation on the Hugging Face Hub.

from hashformers.experiments.evaluation import evaluate_df
import pandas as pd
from hashformers import TransformerWordSegmenter
from datasets import load_dataset

# Segmenter configuration: an incremental (causal LM) distilgpt2 scorer
# with no reranker stage — both reranker fields are explicitly unset.
segmenter_config = {
    "segmenter_model_name_or_path": "distilgpt2",
    "segmenter_model_type": "incremental",
    "reranker_model_name_or_path": None,
    "reranker_model_type": None,
}
ws = TransformerWordSegmenter(**segmenter_config)

# Hugging Face Hub namespace that hosts the benchmark datasets.
user = "ruanchaves"

# Fully-qualified dataset identifiers ("<user>/<short_name>") for the
# ten hashtag word-segmentation datasets used in this benchmark.
dataset_names = [
    "/".join((user, short_name))
    for short_name in (
        "boun",
        "stan_small",
        "stan_large",
        "dev_stanford",
        "test_stanford",
        "snap",
        "hashset_distant",
        "hashset_manual",
        "hashset_distant_sampled",
        "nru_hse",
    )
]

def generate_experiments(datasets, splits, samples=100):
    """Yield loaded dataset slices for every (dataset, split) pair.

    Parameters
    ----------
    datasets : iterable of str
        Hugging Face dataset identifiers (e.g. "ruanchaves/boun").
    splits : iterable of str
        Split names to try for each dataset (e.g. "train", "test").
    samples : int, optional
        Number of rows taken from the head of each split (default 100).

    Yields
    ------
    dict
        Keys: "dataset" (the loaded slice), "split", "name".

    Notes
    -----
    Not every dataset defines every split, so pairs that fail to load
    are silently skipped instead of raising.
    """
    for dataset_name in datasets:
        for split in splits:
            try:
                # Slice syntax "<split>[0:<samples>]" downloads only the
                # head of the split.
                dataset = load_dataset(dataset_name, split=f"{split}[0:{samples}]")
            except Exception:
                # Missing split or unreachable dataset: skip this pair.
                # (Was a bare `except:`, which also swallowed
                # KeyboardInterrupt and SystemExit.)
                continue
            yield {
                "dataset": dataset,
                "split": split,
                "name": dataset_name,
            }

# Segment every dataset slice and collect one metrics row per
# (dataset, split) pair.
benchmark = []
for experiment in generate_experiments(dataset_names, ["train", "validation", "test"], samples=100):
    hashtags = experiment["dataset"]["hashtag"]
    annotations = experiment["dataset"]["segmentation"]
    # Pure language-model scoring: reranking and rank output disabled.
    segmentations = ws.segment(hashtags, use_reranker=False, return_ranks=False)

    # One row per hashtag: gold annotation, raw hashtag, model output.
    rows = zip(annotations, hashtags, segmentations)
    eval_df = pd.DataFrame(rows, columns=["gold", "hashtags", "segmentation"])

    eval_results = evaluate_df(
        eval_df,
        gold_field="gold",
        segmentation_field="segmentation"
    )

    # Tag the metrics with their provenance before collecting them.
    eval_results["name"] = experiment["name"]
    eval_results["split"] = experiment["split"]
    benchmark.append(eval_results)

# Collate per-experiment metrics into a single table: strip the
# "<user>/" prefix from dataset names, index by (name, split), and
# round metrics to three decimals for display.
benchmark_df = (
    pd.DataFrame(benchmark)
    .assign(name=lambda df: df["name"].str.slice(len(user) + 1))
    .set_index(["name", "split"])
    .round(3)
)

print(benchmark_df)
Clone this wiki locally