Source code for soft_search.data.irr

#!/usr/bin/env python

from itertools import combinations
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa

from .soft_search_2022 import (
    SoftSearch2022IRRDatasetFields,
    load_soft_search_2022_training_irr,
)

###############################################################################


def calc_fleiss_kappa(
    data: Union[str, Path, pd.DataFrame],
) -> float:
    """
    Calculate the Fleiss Kappa score as a metric for inter-rater reliability
    for the soft-search dataset.

    Parameters
    ----------
    data: Union[str, Path, pd.DataFrame]
        The path to the dataset (as parquet) or an in-memory DataFrame.

    Returns
    -------
    float
        The kappa statistic for the data.

    See Also
    --------
    soft_search.data.soft_search_2022.load_soft_search_2022_training_irr
        The function to load the IRR data.

    Notes
    -----
    See interpretation of the Fleiss Kappa statistic:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3900052/table/t3-biochem-med-22-3-276-4/?report=objectonly
    """
    # Assume the data is the soft-search labelled dataset
    if isinstance(data, (str, Path)):
        data = pd.read_parquet(data)

    # Sort by link so every annotator's labels are in the same subject order
    sorted_data = data.sort_values(
        by=[
            SoftSearch2022IRRDatasetFields.github_link,
        ],
    )

    # Collect just the annotation values, one array per annotator
    annotations: List[np.ndarray] = []
    for annotator_label in sorted_data[
        SoftSearch2022IRRDatasetFields.annotator
    ].unique():
        annotations.append(
            sorted_data.loc[
                sorted_data[SoftSearch2022IRRDatasetFields.annotator]
                == annotator_label
            ][SoftSearch2022IRRDatasetFields.include_in_definition].values
        )

    # Merge the annotations into a single frame, transposed so that
    # subjects are rows and annotators are columns
    annotations_frame = pd.DataFrame(annotations).T

    # Aggregate raters into a subjects-by-categories count table
    agg_raters, _ = aggregate_raters(annotations_frame)

    # Calculate the kappa statistic and return
    return fleiss_kappa(agg_raters)
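
###############################################################################

# Example usage -- a minimal sketch, not part of the module source. The toy
# frame below is hypothetical; any DataFrame (or parquet file) carrying the
# SoftSearch2022IRRDatasetFields columns works the same way.

if __name__ == "__main__":
    # With the packaged IRR data (per the See Also in the docstring):
    # kappa = calc_fleiss_kappa(load_soft_search_2022_training_irr())

    # Two annotators ("a" and "b") labelling the same two GitHub links
    # and agreeing on both, which yields a kappa of 1.0
    example = pd.DataFrame(
        {
            SoftSearch2022IRRDatasetFields.annotator: ["a", "a", "b", "b"],
            SoftSearch2022IRRDatasetFields.github_link: [
                "link-1",
                "link-2",
                "link-1",
                "link-2",
            ],
            SoftSearch2022IRRDatasetFields.include_in_definition: [
                "include",
                "exclude",
                "include",
                "exclude",
            ],
        }
    )
    print(calc_fleiss_kappa(example))  # 1.0 (perfect agreement)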