import bacpipe
import numpy as np
import re
from sklearn.metrics import classification_report
[docs]
def _normalize_label(label):
    """Return `label` lower-cased with hyphens and whitespace removed."""
    return re.sub(r'[-\s]', '', label).lower()


def _align_ground_truth(gt_labels, gt_label_dict, label2idx):
    """
    Translate ground truth label indices into classifier indices.

    Labels are first matched by exact name and, failing that, by a
    normalized form (lower case, no hyphens or whitespace).

    Parameters
    ----------
    gt_labels : np.ndarray
        ground truth label indices; values are indices into
        `gt_label_dict`. NOTE(review): assumed to be 2D with one row
        per timestamp and -1 for "no label" - confirm against
        bacpipe.ground_truth_by_model.
    gt_label_dict : dict
        maps ground truth label name to ground truth index
    label2idx : dict
        maps classifier label name to classifier column index

    Returns
    -------
    tuple
        (aligned, found, not_found): `aligned` is a new integer array
        of the same shape as `gt_labels` holding classifier indices,
        with -1 wherever there is no label or the label has no
        classifier counterpart. `found` and `not_found` are lists of
        ground truth label names.
    """
    # Always build masks from the untouched source array. Writing into
    # the array that is also being compared against lets a freshly
    # written classifier index collide with a not-yet-processed ground
    # truth index and get remapped a second time.
    source = np.asarray(gt_labels)
    aligned = np.full(source.shape, -1, dtype=int)

    found = []
    not_found = []
    print(
        'The following species were found in the ground truth '
        'and the predictions:'
    )
    for label, gt_idx in gt_label_dict.items():
        if label in label2idx:
            print(label)
            found.append(label)
            aligned[source == gt_idx] = label2idx[label]
        else:
            not_found.append(label)

    if not_found:
        print(
            '\nThese species were found in the ground truth but '
            'NOT in the predictions:',
            not_found
        )
        normalized_l2i = {
            _normalize_label(label): i for label, i in label2idx.items()
        }
        # Collect the leftovers in a new list instead of removing from
        # the list being iterated, which would skip elements.
        still_missing = []
        for label in not_found:
            key = _normalize_label(label)
            if key in normalized_l2i:
                print('With regex we found', label)
                found.append(label)
                # Look up this label's own ground truth index rather
                # than reusing the last index of the loop above.
                aligned[source == gt_label_dict[label]] = normalized_l2i[key]
            else:
                still_missing.append(label)
        not_found = still_missing
        print('With regex we still did not find:', not_found)

    return aligned, found, not_found


def _build_binary_matrices(aligned, preds):
    """
    Build multi-label indicator matrices for ground truth and predictions.

    Parameters
    ----------
    aligned : np.ndarray
        ground truth in classifier index space, -1 meaning "no label";
        indexed along axis 0 by timestamp
    preds : np.ndarray
        2D prediction array (timestamps x classifier classes); every
        value greater than 0 counts as a positive prediction

    Returns
    -------
    tuple
        (gt_binary, pred_binary, class_ids, n_annotated). Columns of
        both matrices follow `class_ids` (sorted classifier indices
        that occur in the ground truth). Rows without any ground truth
        label are dropped. `class_ids` is empty if no labelled cell
        exists, in which case the matrices have zero columns.
    """
    # Binarize into a new array so the caller's predictions (which may
    # be cached by the loader) are left untouched.
    pred_binary = (np.asarray(preds) > 0).astype(int)

    # np.unique returns sorted values, giving a deterministic column order.
    class_ids = np.unique(aligned[aligned > -1]).tolist()

    gt_binary = np.zeros((len(pred_binary), len(class_ids)), dtype=int)
    for col, class_id in enumerate(class_ids):
        gt_binary[np.any(aligned == class_id, axis=1), col] = 1
    pred_binary = pred_binary[:, class_ids]

    # Filter out unannotated timestamps
    annotated_mask = gt_binary.sum(axis=1) > 0
    return (
        gt_binary[annotated_mask],
        pred_binary[annotated_mask],
        class_ids,
        int(annotated_mask.sum()),
    )


def benchmark(
    model, dataset,
    annotations_file=None,
    CustomModel=None,
    check_if_already_processed=True,
    **kwargs
):
    """
    Benchmark a model's classifier performance for a dataset.

    The dataset requires an annotation file that is located in
    the root directory of the dataset. This annotation file
    needs to have the column names: `start`, `end`,
    `audiofilename`, `label:species` so that the ground truth
    can be extracted.

    Ground truth is mapped to the timestamps so that predictions
    and ground truth have the same shape. Ground truth labels are
    matched to classifier labels by exact name first and then by a
    normalized name (lower case, hyphens and whitespace removed).
    Only classes present in both are evaluated, and only timestamps
    that carry at least one such ground truth label.

    If predictions have already been produced this function runs
    very quickly as it uses the saved data.

    Finally the sklearn.metrics.classification_report function
    is used to quantify the performance. The results are printed
    as a report and returned as a dictionary.

    This function expects a threshold. Threshold-independent
    performance evaluation is currently not supported. Every
    prediction value greater than 0 is counted as a positive.

    Parameters
    ----------
    model : string
        model name
    dataset : string
        path to audio dataset
    annotations_file : string, optional
        file name of annotations, by default None
    CustomModel : class, optional
        Custom model to use for the predictions, by default None
    check_if_already_processed : bool, optional
        forwarded unchanged to bacpipe.run_pipeline_for_single_model,
        defaults to True. NOTE(review): presumably True reuses
        existing results and False forces regeneration - confirm
        against bacpipe.
    **kwargs
        forwarded to bacpipe.run_pipeline_for_single_model

    Returns
    -------
    dict
        On success: `report` (classification report as dict),
        `gt_binary` and `pred_binary` (indicator arrays restricted to
        evaluated classes and annotated timestamps), `label2idx`
        (classifier label to index dict) and `not_found` (list of
        ground truth species that weren't found in the classifier
        class list).
        On failure: a dict with the single key `error` holding a
        message.
    """
    print('Fetching ground truth and mapping it to model timestamps.\n')
    gt = bacpipe.ground_truth_by_model(
        model,
        audio_dir=dataset,
        annotations_filename=annotations_file,
        single_label=False,
        bool_filter_labels=False,
        overwrite=True
    )

    loader_obj = bacpipe.run_pipeline_for_single_model(
        model_name=model,
        audio_dir=dataset,
        CustomModel=CustomModel,
        check_if_already_processed=check_if_already_processed,
        **kwargs
    )

    print('\nFetching model predictions.\n')
    preds, label2idx = loader_obj.predictions(return_type='array')
    if preds is None:
        return {'error': "No predictions have been generated, or model does not have classifier."}

    # Align ground truth labels to predicted label indices
    aligned, found, not_found = _align_ground_truth(
        gt['label:species'], gt['label_dict:species'], label2idx
    )
    if not found:
        return {'error': "No ground truth classes have been found in the predictions."}

    gt_binary, pred_binary, class_ids, n_annotated = _build_binary_matrices(
        aligned, preds
    )
    if not class_ids:
        # Labels matched by name but none of them is attached to any
        # timestamp; a report over zero classes would fail in sklearn.
        return {'error': "None of the matched ground truth classes occur in the annotated timestamps."}
    print(f"annotated timestamps: {n_annotated} of {len(preds)}")

    # Evaluate performance
    idx2label = {v: k for k, v in label2idx.items()}
    target_names = [idx2label[i] for i in class_ids]
    report_kwargs = {'target_names': target_names, 'zero_division': 0}
    report = classification_report(
        gt_binary, pred_binary, output_dict=True, **report_kwargs
    )

    print("\n--- Overall Report ---")
    print(classification_report(gt_binary, pred_binary, **report_kwargs))

    return {
        'report': report,
        'gt_binary': gt_binary,
        'pred_binary': pred_binary,
        'label2idx': label2idx,
        'not_found': not_found
    }