Settings

(bacpipe.settings)
##################-------------------###################
##################   PATH SETTINGS   ###################
##################-------------------###################

# fixed path, embeddings will be stored here, advised not to change
# because this is also where bacpipe will look for existing embeddings
### IMPORTANT! WINDOWS USERS: DON'T USE QUOTATION MARKS HERE
main_results_dir :      bacpipe_results
embed_parent_dir :      embeddings
dim_reduc_parent_dir :  dim_reduced_embeddings
evaluations_dir :       evaluations

# fixed path, model checkpoints are/should be stored here
# this is also where bacpipe will look for existing checkpoints
model_base_path: bacpipe/model_checkpoints


#########-----------------------------------############
#########   EMBEDDING GENERATION SETTINGS   ############
#########-----------------------------------############

# specify your device, if unsure use 'cpu', if you are working on a 
# gpu computer use 'cuda'
device: 'cpu'

# batch size for embedding generation, modify if you have memory issues
# or if you want to speed up the process and have enough memory available
global_batch_size: 8


# supported formats of audio files
audio_suffixes: ['.wav', '.WAV', '.aif', '.mp3', '.MP3', '.flac', '.ogg']

# Specify the padding strategy for the models. This will be used
# both when only annotated segments are embedded and when audio gets
# imported and windowed and the last frame ends up not being
# long enough to fit a full context window, in which case the frame
# gets padded. Specify how it should get padded.
# Choose values from the np.pad function documentation
# (https://numpy.org/devdocs/reference/generated/numpy.pad.html).
padding: 'wrap'

# To increase performance when running inference on a GPU, bacpipe will use threads
# to load audio data in one thread and compute embeddings in another. That way audio 
# loading can happen while embeddings are being calculated. This is especially 
# effective for datasets with a lot of small files and can separate the cpu and gpu 
# tasks nicely. If you happen to have problems with this, you can set the following
# to True and it will always revert to sequential operation, also on GPUs.
avoid_pipelined_gpu_inference: False

# If using parallel processing, by default bacpipe will find all available cpu cores
# and subtract 1 from that number to determine the amount of available cpus. If you 
# want to restrict this by setting a fixed number of cpus, specify the number here
nr_parallel_workers: False

# To avoid keeping embeddings that were created while configurations are still being
# tested, this is set to True, so embeddings are removed on a keyboard interrupt.
# Set to False if you want to keep embeddings even if the process is interrupted.
rm_embedding_on_keyboard_interrupt: True

#########------------------------------------###########
#########   USING YOUR ANNOTATION SETTINGS   ###########
#########------------------------------------###########

# key corresponding to the label in the annotation.csv file, if labeled data exists
# by default this is species, but change if needed
label_column: 'species'

# file name of the annotations file. This file needs to be located
# in the root directory of your audio data.
annotations_filename :  annotations.csv

# If you have an annotations file in the format specified in the 
# ReadMe file, you can choose to only create embeddings for the segments
# corresponding to the annotations. For models that require input segments
# longer than an annotation, the audio will be padded according to the
# padding specified above (see `padding`).
only_embed_annotations: False

##########-----------------------------#################
##########   DEFAULT LABELS SETTINGS   #################
##########-----------------------------#################

# kinds of default labels that are created for the embeddings.
# only modify these if you are ready to modify the 
# corresponding code in label_embeddings.py
default_label_keys: [
  "time_of_day",
  "day_of_year",
  "continuous_timestamp",
  "parent_directory",
  "audio_file_name"
  ]


###############   EVALUATION SETTINGS ################

#######-------------------------------------############
#######   INTEGRATED CLASSIFIER  SETTINGS   ############
#######-------------------------------------############

# For evaluation using classification it is necessary to have a minimum number 
# of embeddings for each label. The remaining embeddings will be marked as noise.
# Change this value at will, but be aware that classification might fail if it's too low. 
# Default = 150 (NOTE: the value configured below is 50 -- confirm which is intended).
min_label_occurrences: 50

# If you want to visualize the embeddings by ground truth with no minimum
# number of occurrences, set this to False.
bool_filter_labels: False

# If the model ships with a classifier, by default it will be used
# to provide another option to color-code embeddings
# and visualize differences between embeddings.
run_pretrained_classifier: True
classifier_threshold: 0.5

# By default all default classifier predictions will also be saved
# as raven tables. If you do not want this, set this to False. 
save_raven_tables: True

#######----------------------############
#######   PROBING SETTINGS   ############
#######----------------------############

# train/test/validation ratio for linear probe training
train_ratio: 0.65
test_ratio: 0.2
# the rest will be val_ratio = 1 - (train_ratio + test_ratio)

# configurations for the probing, feel free to add more configurations
probe_configs:
  config_1:
    bool: True
    name: "linear"
    learning_rate: 0.0005
    batch_size: 64
    num_epochs: 20
    dataset_csv_path: probing_dataframe.csv
    shuffle: True
  config_2:
    bool: True
    name: "knn"
    n_neighbors: 15
    dataset_csv_path: probing_dataframe.csv

# by default the trained linear probe will be saved
# along with the performance results and plots of the results
save_probe: True

#######-------------------------############
#######   CLUSTERING SETTINGS   ############
#######-------------------------############

# specify if you want to use the silhouette score to evaluate 
# the clustering results. Can be very slow for large datasets
# and is not recommended for datasets with a large number of clusters.
evaluate_with_silhouette: False

# configurations for clustering, use the bool flag to enable/disable
# a configuration and feel free to add or modify the existing ones
clust_configs: 
  config_1: 
    bool: True
    name: "kmeans"
    # number of clusters is set to a default value of 18;
    # this default value can be adjusted as needed
    params:
      n_clusters: 18
  config_2:
    bool: False
    name: "hdbscan" # note that this will require installing the hdbscan package
    params:
      min_cluster_size: 10
      min_samples: 5
      metric: "euclidean"
    

#######------------------------############
#######   DASHBOARD SETTINGS   ############
#######------------------------############

# port for dashboard
dashboard_port: 5006

# address for dashboard
dashboard_address: localhost

# if this is True, you can embed the dashboard
# into an existing html website
dashboard_websocket_origin: False

# figure height specifications for dashboard
embed_fig_height: 700
spectrogram_plot_height: 550
heatmap_fig_height: 600
accordion_width: 780

# color scales used in dashboard
spec_colorscale: 'Viridis'
color_continuous: 'Twilight'



#######-----------------------------------############
#######     SLOW DOWN AUDIO SETTINGS      ############
#######   FOR ULTRASONIC VOCALIZATIONS    ############
#######-----------------------------------############

# this is still in testing, but the goal is 
# to slow down or speed up audio so that ultrasonic
# recordings can be reduced in apparent frequency and
# models like birdnet can process ultrasonic recordings
bool_change_speed: False
new_speed: 0.1


###########################################

# If set to true bacpipe will always use the same test files and save it in a 
# directory other than the results directory. Results will also be deleted at 
# the end of the computation.
testing: False