Source code for bacpipe.embedding_evaluation.benchmark

import bacpipe
import numpy as np
import re

from sklearn.metrics import classification_report


[docs]
def benchmark(
    model, dataset,
    annotations_file=None,
    CustomModel=None,
    check_if_already_processed=True,
    **kwargs
    ):
    """
    Benchmark a model's classifier performance for a dataset.

    The dataset requires an annotation file that is located in
    the root directory of the dataset. This annotation file
    needs to have the column names: `start`, `end`,
    `audiofilename`, `label:species` so that the ground truth
    can be extracted.
    Ground truth is mapped to the timestamps so that predictions
    and ground_truth have the same shape.
    If predictions have already been produced this function runs
    very quickly as it uses the saved data.

    Ground truth labels are matched to the classifier's labels by
    exact name first. Labels that are still unmatched are compared
    again after removing hyphens/whitespace and lower-casing both
    sides. Only classes present in both the ground truth and the
    classifier's label set are evaluated, and only timestamps that
    carry at least one of those classes are scored.

    Finally the sklearn.metrics.classification_report function
    is used to quantify the performance. The results are printed
    as a report and returned as a dictionary.
    This function expects a threshold. Threshold-independent
    performance evaluation is currently not supported.

    Parameters
    ----------
    model : string
        model name
    dataset : string
        path to audio dataset
    annotations_file : string, optional
        file name of annotations, by default None
    CustomModel : class, optional
        Custom model to use for the predictions, by default None
    check_if_already_processed : bool, optional
        forwarded to `bacpipe.run_pipeline_for_single_model`.
        Presumably set this to False to force embeddings to be
        generated again (confirm against the pipeline
        documentation), defaults to True
    **kwargs
        forwarded unchanged to
        `bacpipe.run_pipeline_for_single_model`

    Returns
    -------
    dict
        On success: dictionary with the keys `report` (report
        results), `gt_binary` (ground truth array), `pred_binary`
        (predictions array), `label2idx` (label to index dict of
        the classifier) and `not_found` (list of the species that
        weren't found in the classifier class list).
        On failure: dictionary with the single key `error`
        holding a message describing why no report was produced.
    """
    print('Fetching ground truth and mapping it to model timestamps.\n')
    gt = bacpipe.ground_truth_by_model(
        model,
        audio_dir=dataset,
        annotations_filename=annotations_file,
        single_label=False,
        bool_filter_labels=False,
        overwrite=True
    )

    loader_obj = bacpipe.run_pipeline_for_single_model(
        model_name=model,
        audio_dir=dataset,
        CustomModel=CustomModel,
        check_if_already_processed=check_if_already_processed,
        **kwargs
    )

    print('\nFetching model predictions.\n')
    preds, label2idx = loader_obj.predictions(return_type='array')
    if preds is None:
        return {'error': "No predictions have been generated, or model does not have classifier."}

    # Align ground truth labels to predicted label indices.
    # The aligned array is built separately from the raw ground truth:
    #   * masks are always computed on the untouched raw indices, so a value
    #     that was already remapped can never be remapped a second time;
    #   * everything starts at -1 ("no class"), so labels without a match do
    #     not leak their raw index into the classifier's index space.
    gt_labels = np.asarray(gt['label:species'])
    gt_label2idx = gt['label_dict:species']
    ground_truth_array = np.full(gt_labels.shape, -1, dtype=int)
    not_found = []
    found = []

    print(
        'The following species were found in the ground truth '
        'and the predictions:'
        )
    for label, idx in gt_label2idx.items():
        if label in label2idx:
            print(label)
            found.append(label)
            ground_truth_array[gt_labels == idx] = label2idx[label]
        else:
            not_found.append(label)

    if not_found:
        print(
            '\nThese species were found in the ground truth but '
            'NOT in the predictions:',
            not_found
            )

        strip_pattern = re.compile(r'[-\s]')

        def normalize(name):
            # Ignore hyphens, whitespace and case when comparing names.
            return strip_pattern.sub('', name).lower()

        l2i_regex = {normalize(label): i for label, i in label2idx.items()}
        # Collect the remaining misses in a new list instead of removing
        # from `not_found` while iterating over it (which skips elements).
        still_not_found = []
        for label in not_found:
            label_regex = normalize(label)
            if label_regex in l2i_regex:
                print('With regex we found', label)
                found.append(label)
                # Look up this label's own ground truth index; the loop
                # variable of the exact-match pass above is stale here.
                ground_truth_array[gt_labels == gt_label2idx[label]] = (
                    l2i_regex[label_regex]
                )
            else:
                still_not_found.append(label)
        not_found = still_not_found
        print('With regex we still did not find:', not_found)

    if not found:
        return {'error': "No ground truth classes have been found in the predictions."}

    # Build binary matrices using label2idx as column ordering
    n_timestamps = len(preds)
    n_classes = len(label2idx)

    # Binarize into a new array so the loader's predictions are not mutated.
    pred_binary = (np.asarray(preds) > 0).astype(int)

    # Classes to evaluate: present in the aligned ground truth AND known to
    # the classifier. Sorted so that the column order is deterministic.
    gt_classes = set(ground_truth_array[ground_truth_array > -1].tolist())
    pred_classes = set(label2idx.values())
    all_classes = sorted(gt_classes.intersection(pred_classes))

    if not all_classes:
        return {'error': "None of the matched ground truth classes occur in the annotated timestamps."}

    # Assumes the ground truth is 2D (timestamps x label slots), which is
    # what the `axis=1` reduction requires.
    gt_binary = np.zeros((n_timestamps, n_classes), dtype=int)
    for col_idx in all_classes:
        gt_binary[np.any(ground_truth_array == col_idx, axis=1), col_idx] = 1

    # Filter to columns that appear in ground truth
    gt_binary = gt_binary[:, all_classes]
    pred_binary = pred_binary[:, all_classes]

    # Filter out unannotated timestamps
    annotated_mask = gt_binary.sum(axis=1) > 0
    gt_binary = gt_binary[annotated_mask]
    pred_binary = pred_binary[annotated_mask]

    print(f"annotated timestamps: {annotated_mask.sum()} of {n_timestamps}")

    # Evaluate performance
    idx2label = {v: k for k, v in label2idx.items()}
    target_names = [idx2label[i] for i in all_classes]

    report = classification_report(
        gt_binary,
        pred_binary,
        target_names=target_names,
        zero_division=0,
        output_dict=True
    )
    print("\n--- Overall Report ---")
    print(classification_report(
        gt_binary,
        pred_binary,
        target_names=target_names,
        zero_division=0
    ))

    return {
        'report': report,
        'gt_binary': gt_binary,
        'pred_binary': pred_binary,
        'label2idx': label2idx,
        'not_found': not_found
    }