Settings
(bacpipe.settings)
##################-------------------###################
################## PATH SETTINGS ###################
##################-------------------###################
# fixed path, embeddings will be stored here, advised not to change
# because this is also where bacpipe will look for existing embeddings
### IMPORTANT! WINDOWS USERS: DON'T USE QUOTATION MARKS HERE
main_results_dir : bacpipe_results
embed_parent_dir : embeddings
dim_reduc_parent_dir : dim_reduced_embeddings
evaluations_dir : evaluations
# fixed path, model checkpoints are/should be stored here
# this is also where bacpipe will look for existing checkpoints
model_base_path: bacpipe/model_checkpoints
#########-----------------------------------############
######### EMBEDDING GENERATION SETTINGS ############
#########-----------------------------------############
# specify your device, if unsure use 'cpu', if you are working on a
# gpu computer use 'cuda'
device: 'cpu'
# batch size for embedding generation, modify if you have memory issues
# or if you want to speed up the process and have enough memory available
global_batch_size: 8
# supported formats of audio files
audio_suffixes: ['.wav', '.WAV', '.aif', '.mp3', '.MP3', '.flac', '.ogg']
# Specify the padding strategy for the models. This is used both when
# only annotated segments are embedded and when audio gets imported
# and windowed: the last frame might not be long enough to fit a full
# context window, in which case the frame gets padded. Specify how it
# should get padded.
# Choose values from the np.pad function documentation
# (https://numpy.org/devdocs/reference/generated/numpy.pad.html).
padding: 'wrap'
# To increase performance when running inference on a GPU, bacpipe will use threads
# to load audio data in one thread and compute embeddings in another. That way audio
# loading can happen while embeddings are being calculated. This is especially
# effective for datasets with a lot of small files and can separate the cpu and gpu
# tasks nicely. If you happen to have problems with this, you can set the following
# to True, and bacpipe will always fall back to sequential operation, even on GPUs.
avoid_pipelined_gpu_inference: False
# If using parallel processing, by default bacpipe will find all available cpu cores
# and subtract 1 from that number to determine the amount of available cpus. If you
# want to restrict this by setting a fixed number of cpus, specify the number here
nr_parallel_workers: False
# To avoid keeping embeddings while configurations are still being tested, this is set to True,
# so embeddings are removed when the process is interrupted from the keyboard.
# Set to False if you want to keep embeddings even if the process is interrupted.
rm_embedding_on_keyboard_interrupt: True
#########------------------------------------###########
######### USING YOUR ANNOTATION SETTINGS ###########
#########------------------------------------###########
# key corresponding to the label in the annotation.csv file, if labeled data exists
# by default this is species, but change if needed
label_column: 'species'
# file name of the annotations file. This file needs to be located
# in the root directory of your audio data.
annotations_filename : annotations.csv
# If you have an annotations file in the format specified in the
# ReadMe file, you can choose to only create embeddings for the segments
# corresponding to the annotations. For models that require input segments
# longer than an annotation, the audio will be padded according to the
# padding specified above (see `padding`).
only_embed_annotations: False
##########-----------------------------#################
########## DEFAULT LABELS SETTINGS #################
##########-----------------------------#################
# kinds of default labels that are created for the embeddings.
# only modify these if you are ready to modify the
# corresponding code in label_embeddings.py
default_label_keys: [
"time_of_day",
"day_of_year",
"continuous_timestamp",
"parent_directory",
"audio_file_name"
]
############### EVALUATION SETTINGS ################
#######-------------------------------------############
####### INTEGRATED CLASSIFIER SETTINGS ############
#######-------------------------------------############
# For evaluation using classification it is necessary to have a minimum number
# of embeddings for each label. The remaining embeddings will be marked as noise.
# Change this value at will, but be aware that classification might fail if it's too low.
# Value shipped in this file: 50 (this note previously said "Default = 150" -- TODO confirm the intended default).
min_label_occurrences: 50
# If you want to visualize the embeddings by ground truth with no minimum
# number of occurrences, set this to False.
bool_filter_labels: False
# if the model has a classifier that ships with the model, by default
# it will be used, to provide another option to colorcode embeddings
# and visualize differences between embeddings.
run_pretrained_classifier: True
classifier_threshold: 0.5
# By default all default classifier predictions will also be saved
# as raven tables. If you do not want this, set this to False.
save_raven_tables: True
#######----------------------############
####### PROBING SETTINGS ############
#######----------------------############
# train/test/validation ratio for linear probe training
train_ratio: 0.65
test_ratio: 0.2
# the rest will be val_ratio = 1 - (train_ratio + test_ratio)
# configurations for the probing, feel free to add more configurations
# Mapping of probe configurations. Each `config_N` entry is a nested mapping;
# every key indented beneath it belongs to that configuration only.
probe_configs:
  config_1:
    bool: True
    name: "linear"
    learning_rate: 0.0005
    batch_size: 64
    num_epochs: 20
    dataset_csv_path: probing_dataframe.csv
    shuffle: True
  config_2:
    bool: True
    name: "knn"
    n_neighbors: 15
    dataset_csv_path: probing_dataframe.csv
# by default the trained linear probe will be saved
# along with the performance results and plots of the results
save_probe: True
#######-------------------------############
####### CLUSTERING SETTINGS ############
#######-------------------------############
# specify if you want to use the silhouette score to evaluate
# the clustering results. Can be very slow for large datasets
# and is not recommended for datasets with a large number of clusters.
evaluate_with_silhouette: False
# configurations for clustering, use the bool flag to enable/disable
# a configuration and feel free to add or modify the existing ones
# Mapping of clustering configurations. Each `config_N` entry is a nested
# mapping; the keys under `params` belong to that configuration only.
clust_configs:
  config_1:
    bool: True
    name: "kmeans"
    # the number of clusters is set to 18 here;
    # adjust this value as needed
    params:
      n_clusters: 18
  config_2:
    bool: False
    # note that this will require installing the hdbscan package
    name: "hdbscan"
    params:
      min_cluster_size: 10
      min_samples: 5
      metric: "euclidean"
#######------------------------############
####### DASHBOARD SETTINGS ############
#######------------------------############
# port for dashboard
dashboard_port: 5006
# address for dashboard
dashboard_address: localhost
# if this is True, you can embed the dashboard
# into an existing html website
dashboard_websocket_origin: False
# figure height specifications for dashboard
embed_fig_height: 700
spectrogram_plot_height: 550
heatmap_fig_height: 600
accordion_width: 780
# color scales used in dashboard
spec_colorscale: 'Viridis'
color_continuous: 'Twilight'
#######-----------------------------------############
####### SLOW DOWN AUDIO SETTINGS ############
####### FOR ULTRASONIC VOCALIZATIONS ############
#######-----------------------------------############
# this is still in testing, but the goal is
# to slow down or speed up audio so that ultrasonic
# recordings can be reduced in apparent frequency and
# models like birdnet can process ultrasonic recordings
bool_change_speed: False
new_speed: 0.1
###########################################
# If set to True, bacpipe will always use the same test files and save it in a
# directory other than the results directory. Results will also be deleted at
# the end of the computation.
testing: False