Source code for drumscript.drum_classifier.classify

# DrumScript/drum_classifier/classify.py

"""
This module defines the classification rules by which the threshold parameters in constants.py are applied to the audio loaded from audio_file_path.
It fuses high-resolution acoustic DNA extraction with simultaneous HFER/LFER physics rules.
Natively detects and filters isolated single-beat cymbals/kicks using Peak Dominance.
"""

import librosa
import numpy as np

from drumscript.notation_generator import constants
from drumscript.notation_generator.constants import (
    HAT_CLOSED_MAX_DECAY,
    HAT_OPEN_MAX_DECAY,
    HOP_LENGTH,
    IDIOPHONE_MIN_HFER_5K,
    KICK_FREQ_MAX,
    KICK_FREQ_MIN,
    KICK_LFER_MIN,
    N_FFT,
    ONSET_SLICE_DURATION_MS,
    SNARE_FREQ_MAX,
    SNARE_FREQ_MIN,
    SNARE_HFER_MIN,
    TOM_FREQ_LOW_MAX,
    TOM_FREQ_MID_MAX,
    TOM_MIN_DECAY,
)


def get_audio_slice(
    audio_path: np.ndarray,
    onset_time: float,
    sr: int,
    duration_ms: "float | None" = None,
) -> np.ndarray:
    """
    Cut a fixed-length slice of audio starting at the onset time.

    The returned window always has exactly ``int(duration_ms / 1000 * sr)``
    samples. Whatever part of the window lies beyond the end of the audio is
    filled with zeros, including the case where the onset itself lies past
    the end of the audio.

    Args:
        audio_path: The audio samples as an array. Despite the name this is
            sample data, not a file path (name kept for caller compatibility).
        onset_time: Start of the slice, in seconds.
        sr: Sample rate in samples per second.
        duration_ms: Slice length in milliseconds. Defaults to
            ``ONSET_SLICE_DURATION_MS`` when omitted.

    Returns:
        The requested slice. It is a view into ``audio_path`` when no padding
        is needed, otherwise a zero-padded copy.
    """
    if duration_ms is None:
        duration_ms = ONSET_SLICE_DURATION_MS

    # Convert duration from ms to seconds, then to samples.
    n_samples = max(0, int((duration_ms / 1000.0) * sr))

    # A negative start index would silently read from the END of the array,
    # so clamp it to the first sample.
    start_sample = max(0, int(onset_time * sr))
    end_sample = start_sample + n_samples

    slice_data = audio_path[start_sample:end_sample]

    # Pad relative to what was actually read. Padding by
    # (end_sample - len(audio)) over-pads when the onset starts past the end.
    missing = n_samples - len(slice_data)
    if missing > 0:
        slice_data = np.pad(slice_data, (0, missing), mode="constant")
    return slice_data


def extract_features(audio_slice_short: np.ndarray, audio_slice_long: np.ndarray, sr: int) -> dict:
    """
    Measure the DSP features of one drum hit.

    Spectral features (peak frequency, centroid, band ratios) are taken from
    the short slice; the decay time is taken from the long slice. Every value
    is wrapped in float() so the result is JSON serializable.

    Returns:
        A dict with the keys ``peak_freq``, ``centroid``, ``decay``, ``lfer``,
        ``hfer`` and ``hfer_5k`` (inserted in that order).
    """
    # Magnitude STFT of the short slice, averaged over time into one spectrum.
    magnitude = np.abs(librosa.stft(audio_slice_short, n_fft=N_FFT, hop_length=HOP_LENGTH))
    mean_spectrum = magnitude.mean(axis=1)
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=N_FFT)

    # Strongest bin of the averaged spectrum.
    peak_freq = float(bin_freqs[np.argmax(mean_spectrum)])

    # Spectral centroid ("brightness"), averaged over the frames.
    brightness = librosa.feature.spectral_centroid(S=magnitude, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH)

    # Decay: number of RMS frames, counted from the loudest frame, that stay
    # at or above 10% of the peak RMS (the -20dB point).
    envelope = librosa.feature.rms(y=audio_slice_long)[0]
    tail = envelope[np.argmax(envelope):]
    below_floor = np.flatnonzero(tail < np.max(envelope) * 0.1)
    decay_frames = int(below_floor[0]) if below_floor.size else len(tail)

    features = {
        "peak_freq": peak_freq,
        "centroid": float(np.mean(brightness)),
        "decay": float(librosa.frames_to_time(decay_frames, sr=sr)),
    }

    # Band ratios are shares of the summed magnitude spectrum.
    total = np.sum(mean_spectrum)
    if total == 0:
        # Silent slice: avoid dividing by zero.
        features.update(lfer=0.0, hfer=0.0, hfer_5k=0.0)
        return features

    bands = (
        ("lfer", bin_freqs <= 150.0),     # low end (<= 150 Hz)
        ("hfer", bin_freqs > 2000.0),     # snare wires (> 2 kHz)
        ("hfer_5k", bin_freqs > 5000.0),  # cymbals / hats (> 5 kHz)
    )
    for key, mask in bands:
        features[key] = float(np.sum(mean_spectrum[mask]) / total)
    return features


def classify_membranophone(p):
    """
    Stage 2A: Sorts skins (kick, snare, toms).

    Args:
        p: Feature dict providing ``lfer``, ``hfer``, ``decay`` and
            ``peak_freq``.

    Returns:
        A list of the matching labels among ``kick``, ``snare``, ``low_tom``,
        ``mid_tom`` and ``high_tom``; empty when nothing matches.
    """
    lfer = p["lfer"]
    peak = p["peak_freq"]
    hfer = p["hfer"]
    decay = p["decay"]

    # A "pure tom" has little wire energy and rings at least TOM_MIN_DECAY.
    tom_like = (hfer < SNARE_HFER_MIN) and (decay >= TOM_MIN_DECAY)

    found = []

    # RULE 1: KICK DRUM - low-heavy with the peak in the kick band, and not
    # ringing like a tom (a kick is a short thud).
    in_kick_band = KICK_FREQ_MIN <= peak <= KICK_FREQ_MAX
    if lfer >= KICK_LFER_MIN and in_kick_band and not tom_like:
        found.append("kick")

    # RULE 2: SNARE DRUM - wire energy within [SNARE_HFER_MIN, 0.85) and the
    # peak in the snare band.
    if (SNARE_HFER_MIN <= hfer < 0.85) and (SNARE_FREQ_MIN <= peak <= SNARE_FREQ_MAX):
        found.append("snare")

    # RULE 3: TOMS - bucketed by peak frequency. "kick" can never be present
    # here because RULE 1 rejects every tom-like hit, so no extra kick check
    # is needed before emitting low_tom.
    if tom_like:
        if peak <= TOM_FREQ_LOW_MAX:
            found.append("low_tom")
        elif peak <= TOM_FREQ_MID_MAX:
            found.append("mid_tom")
        elif peak <= 400:
            found.append("high_tom")

    return found


def classify_idiophone(p):
    """
    Stage 2B: Sorts Metals (Hats, Cymbals).

    Args:
        p: Feature dict providing ``decay``, ``hfer_5k`` and ``centroid``.

    Returns:
        A one-element list holding ``hi_hat_closed``, ``hi_hat_open``,
        ``crash`` or ``ride``, or an empty list when the energy share above
        5 kHz is below ``IDIOPHONE_MIN_HFER_5K``.
    """
    ring_time = p["decay"]

    # RULE 4: METALS (Hats / Cymbals) need enough energy above 5 kHz.
    # Written as "not >=" so a NaN ratio is rejected as well.
    if not (p["hfer_5k"] >= IDIOPHONE_MIN_HFER_5K):
        return []

    if ring_time <= HAT_CLOSED_MAX_DECAY:
        label = "hi_hat_closed"
    elif ring_time <= HAT_OPEN_MAX_DECAY:
        label = "hi_hat_open"
    elif p["centroid"] > 5500:
        # Long ring: a centroid above 5500 is a crash, otherwise a ride.
        label = "crash"
    else:
        label = "ride"
    return [label]


def classify_event(physics):
    """
    Stage 1: Evaluates both skins and Metals simultaneously.

    Runs the membranophone rules, then the idiophone rules, on the same
    feature dict and returns the combined labels (skins first). When neither
    rule set matches, the result is ``["unknown"]``.
    """
    skins = classify_membranophone(physics)
    metals = classify_idiophone(physics)
    return (skins + metals) or ["unknown"]


def classify_rudiment_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]:
    """
    Dedicated classification engine for single beats, paradiddles, and rudiments.

    Pass 1 labels every onset with exactly one instrument, skipping onsets
    whose first 100ms peak is below 10% of the global peak. Pass 2 applies
    transient gating (lockout, attack check, volume gate) to drop re-triggers
    caused by resonance, reverb, or cymbal shimmer.

    Args:
        audio_path: The audio samples as an array (not a file path).
        sr: Sample rate in samples per second.
        onsets: Onset times in seconds.

    Returns:
        The surviving events, each a dict with ``time_sec``, ``instruments``
        and ``debug_features``.
    """
    total_samples = len(audio_path)
    global_max = np.max(np.abs(audio_path)) if total_samples > 0 else 1.0

    def _window(first: int, seconds: float) -> np.ndarray:
        """Samples from `first` spanning `seconds`, zero-padded past the end of the audio."""
        last = first + int(seconds * sr)
        if last <= total_samples:
            return audio_path[first:last]
        return np.pad(audio_path[first:], (0, last - total_samples), mode="constant")

    def _label(p: dict) -> str:
        """Apply the rudiment physics rules; every path yields exactly one label."""
        # Metal vs skin: metals have more than 20% of their energy above 5kHz.
        if p["hfer_5k"] > 0.20:
            if p["decay"] <= constants.HAT_CLOSED_MAX_DECAY:
                return "hi_hat_closed"
            if p["decay"] <= 0.50:
                return "hi_hat_open"
            # Ride is darker, crash is brighter (centroid split at 6000Hz).
            return "crash" if p["centroid"] > 6000 else "ride"

        is_kick_freq = p["peak_freq"] < 105.0
        is_thump = p["lfer"] > 0.35
        # Kick: short decay (< 0.40) plus a centroid above 1000 (beater
        # click), separating real kicks from muffled low toms and stem bleed.
        if is_kick_freq and is_thump and p["decay"] < 0.40 and p["centroid"] > 1000.0:
            return "kick"
        # High-frequency energy ratio above 0.22 keeps punchy toms out.
        if p["hfer"] > 0.22:
            return "snare"
        if p["decay"] > 0.75 or p["peak_freq"] < TOM_FREQ_LOW_MAX:
            return "low_tom"
        if p["peak_freq"] <= TOM_FREQ_MID_MAX:
            return "mid_tom"
        return "high_tom"

    # --- PASS 1: label every sufficiently loud onset ---
    classified_events = []
    for onset_time in onsets:
        start_sample = int(onset_time * sr)

        # Tight 100ms window for the spectral features.
        y_window_short = _window(start_sample, 0.100)
        if len(y_window_short) == 0:
            continue

        # Noise floor gate: below 10% of the global peak is ignored.
        if np.max(np.abs(y_window_short)) < global_max * 0.10:
            continue

        # Long 1.5s window for the decay measurement.
        y_window_long = _window(start_sample, 1.5)

        physics_profile = extract_features(y_window_short, y_window_long, sr)
        classified_events.append(
            {
                "time_sec": float(onset_time),
                "instruments": [_label(physics_profile)],
                "debug_features": physics_profile,
            }
        )

    if not classified_events:
        return []

    # --- PASS 2: transient attack gating ---
    metal_names = {"crash", "ride", "hi_hat_open", "hi_hat_closed"}
    tom_names = {"low_tom", "mid_tom", "high_tom"}

    # The very first strike is always kept.
    final_events = [classified_events[0]]
    last_time = classified_events[0]["time_sec"]

    for ev in classified_events[1:]:
        time_s = ev["time_sec"]

        previous = final_events[-1]["instruments"]
        after_metal = not metal_names.isdisjoint(previous)
        after_tom = not tom_names.isdisjoint(previous)

        # Lockout after the last kept event. Toms get the longest lockout so
        # their resonant wobble does not double-trigger as a ghost onset.
        if after_metal:
            lockout = 0.15
        elif after_tom:
            lockout = 0.18
        else:
            lockout = 0.09
        if time_s - last_time < lockout:
            continue

        # Attack check: compare the peak in the 30ms before the onset with
        # the peak in the 50ms after it. A real hit jumps; a wobble sustains.
        onset_idx = int(time_s * sr)
        pre_data = audio_path[max(0, int((time_s - 0.030) * sr)):onset_idx]
        post_data = audio_path[onset_idx:min(total_samples, int((time_s + 0.050) * sr))]
        pre_vol = np.max(np.abs(pre_data)) if len(pre_data) > 0 else 0.0
        post_vol = np.max(np.abs(post_data)) if len(post_data) > 0 else 0.0

        # Less than a +25% jump is treated as a fake resonance trigger.
        if post_vol < pre_vol * 1.25:
            continue

        # Volume gate: 40% of the global peak when the previously kept event
        # was a metal, otherwise 15% (lets ghost notes through).
        required_vol = global_max * 0.40 if after_metal else global_max * 0.15
        if post_vol > required_vol:
            final_events.append(ev)
            last_time = time_s

    return final_events


def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]:
    """
    Route onsets through the Physics-First Classification Engine.

    First runs a Peak Dominance check: when the audio is shorter than 30s and
    exactly one onset is louder than 50% of the global peak, the file is
    treated as a single drum sample and quieter or closely following onsets
    are dropped. Every remaining onset is then labelled with the inlined
    membranophone and idiophone rules; an onset may receive several labels.

    NOTE(review): the inlined rules differ from classify_membranophone /
    classify_idiophone in this file: the kick rule here has no tom exclusion,
    and the crash/ride centroid split is 2500 here versus 5500 there.
    Confirm whether that drift is intentional.

    Args:
        audio_path: The audio samples as an array (not a file path).
        sr: Sample rate in samples per second.
        onsets: Onset times in seconds.

    Returns:
        One dict per kept onset with ``time_sec``, ``instruments`` and
        ``debug_features``.
    """
    total_samples = len(audio_path)
    global_max = np.max(np.abs(audio_path)) if total_samples > 0 else 1.0
    duration = total_samples / sr

    def _peak(samples: np.ndarray):
        """Absolute peak of `samples`, or 0.0 for an empty array."""
        return np.max(np.abs(samples)) if len(samples) > 0 else 0.0

    def _window(first: int, seconds: float) -> np.ndarray:
        """Samples from `first` spanning `seconds`, zero-padded past the end of the audio."""
        last = first + int(seconds * sr)
        if last <= total_samples:
            return audio_path[first:last]
        return np.pad(audio_path[first:], (0, last - total_samples), mode="constant")

    # --- NATIVE PEAK DOMINANCE CHECK (Single-Beat Detection) ---
    # Count the onsets whose first 100ms exceed half of the global peak.
    probe_len = int(0.1 * sr)
    loud_hit_count = 0
    for t in onsets:
        probe_start = int(t * sr)
        probe = audio_path[probe_start : min(probe_start + probe_len, total_samples)]
        if _peak(probe) > global_max * 0.50:
            loud_hit_count += 1

    # A short track with exactly ONE loud hit is a single drum sample.
    is_single_beat = loud_hit_count == 1 and duration < 30.0

    short_secs = constants.ONSET_SLICE_DURATION_MS / 1000.0
    classified_events = []

    for onset_time in onsets:
        start_sample = int(onset_time * sr)

        # Short window for the spectral features.
        y_window_short = _window(start_sample, short_secs)
        if len(y_window_short) == 0:
            continue
        slice_max = _peak(y_window_short)

        # --- SINGLE BEAT GATE ---
        if is_single_beat:
            # 1. Drop isolated cymbal/kick tails below 50% of the global peak.
            if slice_max < global_max * 0.50:
                continue
            # 2. De-bounce: 0.35s lockout after the last kept event, to deal
            #    with ride cymbal shimmers.
            if classified_events and float(onset_time) - classified_events[-1]["time_sec"] < 0.35:
                continue

        # Long 1.5s window for the decay measurement.
        y_window_long = _window(start_sample, 1.5)

        # 1. Extract the physics DNA.
        physics_profile = extract_features(y_window_short, y_window_long, sr)
        peak_freq = physics_profile["peak_freq"]
        hfer = physics_profile["hfer"]
        decay = physics_profile["decay"]

        # 2. Apply the classification rules.
        instruments = []

        # RULE 1: KICK DRUM
        if physics_profile["lfer"] >= KICK_LFER_MIN and (KICK_FREQ_MIN <= peak_freq <= KICK_FREQ_MAX):
            instruments.append("kick")

        # RULE 2: SNARE DRUM
        if (SNARE_HFER_MIN <= hfer < 0.85) and (SNARE_FREQ_MIN <= peak_freq <= SNARE_FREQ_MAX):
            instruments.append("snare")

        # RULE 3: TOMS - low wire energy and a long enough decay.
        if hfer < SNARE_HFER_MIN and decay >= TOM_MIN_DECAY:
            if peak_freq <= TOM_FREQ_LOW_MAX:
                # A hit already labelled as kick is not also a low tom.
                if "kick" not in instruments:
                    instruments.append("low_tom")
            elif peak_freq <= TOM_FREQ_MID_MAX:
                instruments.append("mid_tom")
            elif peak_freq <= 400:
                instruments.append("high_tom")

        # RULE 4: METALS (Hats / Cymbals)
        if physics_profile["hfer_5k"] >= IDIOPHONE_MIN_HFER_5K:
            if decay <= HAT_CLOSED_MAX_DECAY:
                instruments.append("hi_hat_closed")
            elif decay <= HAT_OPEN_MAX_DECAY:
                instruments.append("hi_hat_open")
            elif physics_profile["centroid"] > 2500:
                instruments.append("crash")
            else:
                instruments.append("ride")

        # Fallback for undetected sounds.
        if not instruments:
            instruments.append("unknown")

        # 3. Append with unified compatible keys.
        classified_events.append(
            {
                "time_sec": float(onset_time),
                "instruments": instruments,
                "debug_features": physics_profile,
            }
        )

    return classified_events


# --------------------------------------------------------------------------uncomment during testing # from datetime import datetime # print("\n# ------------------------------------------------------------------------------------") # datetimestamp = datetime.now() # print(f'\ndate/time: {datetimestamp}') # -------------------------------------------------------------------------------------------------- """ LEGACY CODE (KEEP FOR ALPHA) # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # Restored the explicit `physics_profile` variable name throughout the consolidated rule block for clarity and strict consistency with JSON exports. # Wrapper to route validated onsets through the Physics-First Classification Engine. # Natively detects and filters isolated single-beat cymbals/kicks using Peak Dominance. # All classification rules (membranophone/idiophone) are integrated natively. # from drumscript.notation_generator import constants # classified_events = [] # global_max = np.max(np.abs(audio_path)) if len(audio_path) > 0 else 1.0 # duration = len(audio_path) / sr # --- NATIVE PEAK DOMINANCE CHECK (Single-Beat Detection) --- # loud_hit_count = 0 # for t in onsets: # s_start = int(t * sr) # s_end = s_start + int(0.1 * sr) # s_data = audio_path[s_start:min(s_end, len(audio_path))] # s_vol = np.max(np.abs(s_data)) if len(s_data) > 0 else 0.0 # if s_vol > global_max * 0.50: # loud_hit_count += 1 # If the track is short and has exactly ONE loud hit, it is a single drum sample. 
# is_single_beat = (loud_hit_count == 1 and duration < 30.0) # for onset_time in onsets: # start_sample = int(onset_time * sr) # --- SHORT SLICE PADDING LOGIC (200ms) --- # duration_short_secs = constants.ONSET_SLICE_DURATION_MS / 1000.0 # end_sample_short = start_sample + int(duration_short_secs * sr) # if end_sample_short > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_short - len(audio_path) # y_window_short = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_short = audio_path[start_sample:end_sample_short] # if len(y_window_short) == 0: # continue # slice_max = np.max(np.abs(y_window_short)) if len(y_window_short) > 0 else 0.0 # Standard safety: Drop absolute dead silence # if slice_max < 0.02 * global_max: # continue # --- SINGLE BEAT GATE --- # if is_single_beat: # 1. Strict Gate # if slice_max < global_max * 0.50: # continue # 2. De-Bounce Lockout # if len(classified_events) > 0: # last_time = classified_events[-1]["time_sec"] # if float(onset_time) - last_time < 0.15: # continue # --- LONG SLICE PADDING LOGIC (1.5s) --- # duration_long_secs = 1.5 # end_sample_long = start_sample + int(duration_long_secs * sr) # if end_sample_long > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_long - len(audio_path) # y_window_long = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_long = audio_path[start_sample:end_sample_long] # 1. Extract the physics DNA # physics_profile = extract_features(y_window_short, y_window_long, sr) # 2. 
Apply Classification Rules (Integrated Logic) # instruments = [] # --- MEMBRANOPHONES (skins) --- # RULE 1: KICK DRUM # if physics_profile['lfer'] >= KICK_LFER_MIN and (KICK_FREQ_MIN <= physics_profile['peak_freq'] <= KICK_FREQ_MAX): # instruments.append('kick') # RULE 2: SNARE DRUM # is_snare_freq = (SNARE_FREQ_MIN <= physics_profile['peak_freq'] <= SNARE_FREQ_MAX) # has_snare_wire = (SNARE_HFER_MIN <= physics_profile['hfer'] < 0.85) # if has_snare_wire and is_snare_freq: # instruments.append('snare') # RULE 3: TOMS # is_pure = physics_profile['hfer'] < SNARE_HFER_MIN # is_resonant = physics_profile['decay'] >= TOM_MIN_DECAY # if is_pure and is_resonant: # if physics_profile['peak_freq'] <= TOM_FREQ_LOW_MAX: # if 'kick' not in instruments: # instruments.append('low_tom') # elif physics_profile['peak_freq'] <= TOM_FREQ_MID_MAX: # instruments.append('mid_tom') # elif physics_profile['peak_freq'] <= 400: # instruments.append('high_tom') # --- IDIOPHONES (Metals) --- # RULE 4: METALS (Hats / Cymbals) # if physics_profile['hfer_5k'] >= IDIOPHONE_MIN_HFER_5K: # if physics_profile['decay'] <= HAT_CLOSED_MAX_DECAY: # instruments.append('hi_hat_closed') # elif physics_profile['decay'] <= HAT_OPEN_MAX_DECAY: # instruments.append('hi_hat_open') # else: # if physics_profile['centroid'] > 2500: # instruments.append('crash') # else: # instruments.append('ride') # Fallback for undetected sounds # if not instruments: # instruments.append('unknown') # 3. Append with unified compatible keys # classified_events.append({ # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # }) # return classified_events ## --- LEGACY CODE --- # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # # Wrapper to route detected onsets through the new Physics-First Classification Engine. # Uses the unified dictionary keys: time_sec, instruments, debug_features. 
# Natively detects and filters isolated single-beat cymbals/kicks using Peak Dominance. # # classified_events = [] # Calculate global parameters to evaluate amplitude gating for single hits # global_max = np.max(np.abs(audio_path)) if len(audio_path) > 0 else 1.0 # duration = len(audio_path) / sr # --- Peak Dominance Check (Single-Beat Detection) --- # Look ahead at the volume of all detected onsets. A ringing cymbal might hallucinate 30 fake onsets due to "shimmer", but only the initial stick # impact will be loud. # loud_hit_count = 0 # for t in onsets: # s_start = int(t * sr) # s_end = s_start + int(0.1 * sr) # Look at the first 100ms of the hit # s_data = audio_path[s_start:min(s_end, len(audio_path))] # s_vol = np.max(np.abs(s_data)) if len(s_data) > 0 else 0.0 # if s_vol > global_max * 0.50: # loud_hit_count += 1 # If the track is short and has exactly ONE loud hit, it is a single drum sample. # is_single_beat = (loud_hit_count == 1 and duration < 30.0) # is_isolated_sample = (loud_hit_count == 1 and duration < 30.0) # --- DYNAMIC ISOLATED SAMPLE DETECTION --- # Cymbals ring out for 5-15 seconds, bypassing our old 'duration < 2.0' check. # We use Onset Density instead. A single hit test sample is sparse (< 1.5 hits per second). # A real drum track is dense (e.g., 60bpm 8th notes = 2 hits per second). 
# onset_density = len(onsets) / duration if duration > 0 else 0 # is_isolated_sample = (duration < 20.0) and (onset_density < 1.5 or len(onsets) <= 5) # for onset_time in onsets: # start_sample = int(onset_time * sr) # --- SHORT SLICE PADDING LOGIC (200ms) --- # duration_short_secs = constants.ONSET_SLICE_DURATION_MS / 1000.0 # end_sample_short = start_sample + int(duration_short_secs * sr) # if end_sample_short > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_short - len(audio_path) # y_window_short = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_short = audio_path[start_sample:end_sample_short] # # # if len(y_window_short) == 0: # continue # slice_max = np.max(np.abs(y_window_short)) if len(y_window_short) > 0 else 0.0 # Standard safety: Drop absolute dead silence to keep the noise floor clean # if slice_max < 0.02 * global_max: # continue # --- Single Beat 'Amplitude Gate' --- # For full songs, we do NOT interfere. We let the onset detector do its job so ghost notes are perfectly preserved (guaranteeing backward # compatibility). We only apply the 50% max volume drop to sparse, isolated single-beat samples # if is_isolated_sample: # 1. Strict Gate: Drop the quiet cymbal shimmers/kick sub-bass tails # slice_max = np.max(np.abs(y_window_short)) if len(y_window_short) > 0 else 0.0 # if slice_max < 0.5 * global_max: # continue # 2. 
De-Bounce Lockout: Prevent double-triggering within 150ms of the valid hit # if len(classified_events) > 0: # last_time = classified_events[-1]["time_sec"] # if float(onset_time) - last_time < 0.15: # continue # --- LONG SLICE PADDING LOGIC (1.5s) --- # duration_long_secs = 1.5 # end_sample_long = start_sample + int(duration_long_secs * sr) # if end_sample_long > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_long - len(audio_path) # y_window_long = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_long = audio_path[start_sample:end_sample_long] # 1. Extract the physics DNA # physics_profile = extract_features(y_window_short, y_window_long, sr) # 2. Run the simultaneous rules # instruments = classify_event(physics_profile) # 3. Append with unified compatible keys # classified_events.append( # { # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # } # ) # return classified_events # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # # Wrapper to route detected onsets through the new Physics-First Classification Engine. # Uses the unified dictionary keys: time_sec, instruments, debug_features. # # classified_events = [] # Calculate global parameters to evaluate amplitude gating for single hits # global_max = np.max(np.abs(audio_path)) if len(audio_path) > 0 else 1.0 # duration = len(audio_path) / sr # --- DYNAMIC ISOLATED SAMPLE DETECTION --- # Cymbals ring out for 5-15 seconds, bypassing our old 'duration < 2.0' check. # We use Onset Density instead. A single hit test sample is sparse (< 1.5 hits per second). # A real drum track is dense (e.g., 60bpm 8th notes = 2 hits per second). 
# onset_density = len(onsets) / duration if duration > 0 else 0 # is_isolated_sample = (duration < 20.0) and (onset_density < 1.5 or len(onsets) <= 5) # for onset_time in onsets: # start_sample = int(onset_time * sr) # --- SHORT SLICE PADDING LOGIC (200ms) --- # duration_short_secs = ONSET_SLICE_DURATION_MS / 1000.0 # end_sample_short = start_sample + int(duration_short_secs * sr) # if end_sample_short > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_short - len(audio_path) # y_window_short = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_short = audio_path[start_sample:end_sample_short] # if len(y_window_short) == 0: # continue # --- STRICT SINGLE-BEAT AMPLITUDE GATE --- # For full songs, we do NOT interfere. We let the onset detector do its job # so ghost notes are perfectly preserved (guaranteeing backward compatibility). # We only apply the 50% max volume drop to sparse, isolated single-beat samples # if is_isolated_sample: # slice_max = np.max(np.abs(y_window_short)) if len(y_window_short) > 0 else 0.0 # if slice_max < 0.5 * global_max: # continue # --- LONG SLICE PADDING LOGIC (1.5s) --- # duration_long_secs = 1.5 # end_sample_long = start_sample + int(duration_long_secs * sr) # if end_sample_long > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_long - len(audio_path) # y_window_long = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_long = audio_path[start_sample:end_sample_long] # 1. Extract the physics DNA # physics_profile = extract_features(y_window_short, y_window_long, sr) # 2. Run the simultaneous rules # instruments = classify_event(physics_profile) # 3. 
Append with unified compatible keys # classified_events.append( # { # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # } # ) # return classified_events # Reasoning: Replaced the static 2.0s duration check with an Onset Density check to safely gate long-ringing cymbals whilst protecting full songs. # --- LEGACY CODE -- # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # Wrapper to route detected onsets through the new Physics-First Classification Engine. # Uses the unified dictionary keys: time_sec, instruments, debug_features. # classified_events = [] # Calculate global parameters to evaluate amplitude gating for single hits # global_max = np.max(np.abs(audio_path)) if len(audio_path) > 0 else 1.0 # duration = len(audio_path) / sr # for onset_time in onsets: # start_sample = int(onset_time * sr) # --- SHORT SLICE PADDING LOGIC (200ms) --- # duration_short_secs = ONSET_SLICE_DURATION_MS / 1000.0 # end_sample_short = start_sample + int(duration_short_secs * sr) # if end_sample_short > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_short - len(audio_path) # y_window_short = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_short = audio_path[start_sample:end_sample_short] # if len(y_window_short) == 0: # continue # --- STRICT SINGLE-BEAT AMPLITUDE GATE --- # For full songs (> 2.0s), we do NOT interfere. We let the onset detector do its job # so ghost notes are perfectly preserved (guaranteeing backward compatibility). 
# We only apply the 50% max volume drop to short, isolated single-beat samples # if duration < 2.0: # slice_max = np.max(np.abs(y_window_short)) if len(y_window_short) > 0 else 0.0 # if len(classified_events) > 0 and slice_max < 0.5 * global_max: # continue # --- LONG SLICE PADDING LOGIC (1.5s) --- # duration_long_secs = 1.5 # end_sample_long = start_sample + int(duration_long_secs * sr) # if end_sample_long > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_long - len(audio_path) # y_window_long = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_long = audio_path[start_sample:end_sample_long] # 1. Extract the physics DNA # physics_profile = extract_features(y_window_short, y_window_long, sr) # 2. Run the simultaneous rules # instruments = classify_event(physics_profile) # 3. Append with unified compatible keys # classified_events.append( # { # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # } # ) # return classified_events # Reasoning: Restricted the amplitude gate strictly to files under 2.0 seconds. This stops double-triggering on single beats while completely # bypassing full songs, ensuring 100% backward compatibility for tracks like "My Love for the Stars". # --- LEGACY CODE --- # Reasoning: Added a volume-based amplitude gate within the `classify_events` loop to discard quiet room reflections in short audio clips, # thereby stopping double-triggering without affecting full-song transcriptions. # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # # # Wrapper to route detected onsets through the new Physics-First Classification Engine. # Uses the unified dictionary keys: time_sec, instruments, debug_features. 
# # # classified_events = [] # # for onset_time in onsets: # start_sample = int(onset_time * sr) # # # --- SHORT SLICE PADDING LOGIC (200ms) --- # duration_short_secs = ONSET_SLICE_DURATION_MS / 1000.0 # end_sample_short = start_sample + int(duration_short_secs * sr) # # if end_sample_short > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_short - len(audio_path) # y_window_short = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_short = audio_path[start_sample:end_sample_short] # # if len(y_window_short) == 0: # continue # # # --- LONG SLICE PADDING LOGIC (1.5s) --- # duration_long_secs = 1.5 # end_sample_long = start_sample + int(duration_long_secs * sr) # # if end_sample_long > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_long - len(audio_path) # y_window_long = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_long = audio_path[start_sample:end_sample_long] # # # # 1. Extract the physics DNA # physics_profile = extract_features(y_window_short, y_window_long, sr) # # # 2. Run the simultaneous rules # instruments = classify_event(physics_profile) # # # 3. Append with unified compatible keys # classified_events.append( # { # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # } # ) # # return classified_events # --- LEGACY CODE --- # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # Wrapper to route detected onsets through the new Physics-First Classification Engine. # Uses the unified dictionary keys: time_sec, instruments, debug_features. 
# classified_events = [] # Calculate global parameters to evaluate amplitude gating # global_max = np.max(np.abs(audio_path)) if len(audio_path) > 0 else 1.0 # duration = len(audio_path) / sr # for onset_time in onsets: # start_sample = int(onset_time * sr) # --- SHORT SLICE PADDING LOGIC (200ms) --- # duration_short_secs = ONSET_SLICE_DURATION_MS / 1000.0 # end_sample_short = start_sample + int(duration_short_secs * sr) # if end_sample_short > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_short - len(audio_path) # y_window_short = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_short = audio_path[start_sample:end_sample_short] # # # if len(y_window_short) == 0: # continue # # --- AMPLITUDE GATING FILTER --- # slice_max = np.max(np.abs(y_window_short)) if len(y_window_short) > 0 else 0.0 # # # Drop absolute silence (< 2% of max) # if slice_max < 0.02 * global_max: # continue # # # For isolated drum samples (< 2.0s), drop secondary quiet room reflections/tails (< 50% max) # if duration < 2.0 and len(classified_events) > 0: # if slice_max < 0.5 * global_max: # continue # # --- LONG SLICE PADDING LOGIC (1.5s) --- # duration_long_secs = 1.5 # end_sample_long = start_sample + int(duration_long_secs * sr) # if end_sample_long > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample_long - len(audio_path) # y_window_long = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window_long = audio_path[start_sample:end_sample_long] # 1. Extract the physics DNA # physics_profile = extract_features(y_window_short, y_window_long, sr) # 2. Run the simultaneous rules # instruments = classify_event(physics_profile) # 3. 
Append with unified compatible keys # classified_events.append( # { # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # } # ) # return classified_events # --- EXTRACT_FEATURES --- # This restores the linear magnitude scale from _classifier.py so the thresholds # (like KICK_LFER_MIN and SNARE_HFER_MIN) work flawlessly again. # def extract_features(audio_slice: np.ndarray, sr: int) -> dict: # Analyses the audio slice and extracts the physical DSP features. # Wraps numpy outputs in float() to ensure JSON serialization. # features = {} # 1. Compute the Frequency Spectrum (FFT) # We use magnitude (abs) of the Short-Time Fourier Transform # stft = np.abs(librosa.stft(audio_slice, n_fft=N_FFT, hop_length=HOP_LENGTH)) # Average the spectrum across the tiny time slice to get one master frequency profile # spectrum = np.mean(stft, axis=1) # frequencies = librosa.fft_frequencies(sr=sr, n_fft=N_FFT) # 2. Find the Peak Frequency (The strongest fundamental tone) # peak_idx = np.argmax(spectrum) # features['peak_freq'] = float(frequencies[peak_idx]) # Added float() casting # 3. Calculate Spectral Centroid (The "Center of Mass" or Brightness) # centroid = librosa.feature.spectral_centroid(S=stft, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH) # features['centroid'] = float(np.mean(centroid)) # Added float() casting # 4. Resonance (Decay Time) - IMPORTED SO TOMS AND HATS CAN BE CLASSIFIED # rms = librosa.feature.rms(y=audio_slice)[0] # peak_rms_idx = np.argmax(rms) # threshold = np.max(rms) * 0.1 # -20dB point # decay_frames = 0 # for i in range(peak_rms_idx, len(rms)): # if rms[i] < threshold: # break # decay_frames += 1 # features['decay'] = float(librosa.frames_to_time(decay_frames, sr=sr)) # Added float() casting # 5. 
Calculate Energy Ratios (LFER & HFER) # total_energy = np.sum(spectrum) # if total_energy == 0: # Prevent division by zero on silent slices # features['lfer'] = 0.0 # features['hfer'] = 0.0 # features['hfer_5k'] = 0.0 # return features # Low Frequency Energy Ratio (Energy below 150Hz) # low_idx = np.where(frequencies <= 150.0)[0] # features['lfer'] = float(np.sum(spectrum[low_idx]) / total_energy) # Added float() casting # snare Wire Energy Ratio (Energy > 2000Hz) # high_idx = np.where(frequencies > 2000.0)[0] # features['hfer'] = float(np.sum(spectrum[high_idx]) / total_energy) # Added float() casting # Cymbal/Hat Energy Ratio (Energy > 5000Hz) # metal_idx = np.where(frequencies > 5000.0)[0] # features['hfer_5k'] = float(np.sum(spectrum[metal_idx]) / total_energy) # Added float() casting # return features # --- COMMENTED OUT GET_PHYSICS_PROFILE --- # REASON: The Welch method calculates Power (amplitude squared), which breaks # the LFER/HFER percentage thresholds. STFT magnitude (extract_features) restores accuracy. # def get_physics_profile(y, sr): # # # Extracts the 'DNA' of the drum hit: # Pitch, Decay, Brightness, and Energy Ratios. # # # # 1. Frequency Analysis (High Resolution) # freqs, psd = scipy.signal.welch(y, sr, nperseg=4096) # peak_idx = np.argmax(psd) # peak_freq = freqs[peak_idx] # # # 2. Spectral Centroid (Brightness - Critical for Cymbals) # centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)) # # # 3. Energy Distribution Ratios # total_energy = np.sum(psd) + 1e-9 # # # Bass Energy (<150Hz) - kick detection # low_energy = np.sum(psd[freqs < 150]) # lfer = low_energy / total_energy # # # Wire Energy (>2000Hz) - snare vs High Tom detection # mid_high_energy = np.sum(psd[freqs > 2000]) # hfer_2k = mid_high_energy / total_energy # # # Shimmer Energy (>5000Hz) - Skin vs Metal detection # high_energy = np.sum(psd[freqs > 5000]) # hfer_5k = high_energy / total_energy # # # 4. 
Resonance (Decay Time) # rms = librosa.feature.rms(y=y)[0] # peak_rms_idx = np.argmax(rms) # threshold = np.max(rms) * 0.1 # -20dB point # # decay_frames = 0 # for i in range(peak_rms_idx, len(rms)): # if rms[i] < threshold: # break # decay_frames += 1 # decay_time = librosa.frames_to_time(decay_frames, sr=sr) # # return { # "peak_freq": peak_freq, # "centroid": centroid, # "lfer": lfer, # "hfer_2k": hfer_2k, # "hfer": hfer_2k, # Mapped to 'hfer' to ensure compatibility with classify_onset # "hfer_5k": hfer_5k, # "decay": decay_time # } # --- LEGACY CODE - COMMENTED OUT CLASSIFY_MEMBRANOPHONE (March 17 Interim) --- # def classify_membranophone(p): # # # Stage 2A: Sorts skins (kick, snare, toms). # Updated to merge Feb 9 explicit logic with simultaneous hit routing. # # # detected_instruments = [] # # # RULE 1: KICK DRUM (Using exact Feb 9 explicit logic) # is_kick_freq = KICK_FREQ_MIN <= p['peak_freq'] <= KICK_FREQ_MAX # is_thump = p['lfer'] >= KICK_LFER_MIN # not_too_crisp = p['hfer'] < SNARE_HFER_MIN # Excludes fat snares # # if is_kick_freq and is_thump and not_too_crisp: # detected_instruments.append('kick') # # # RULE 2: SNARE DRUM (Standard + Fat snare catches) # is_snare_freq = (SNARE_FREQ_MIN <= p['peak_freq'] <= SNARE_FREQ_MAX) # has_snare_wire = (SNARE_HFER_MIN <= p['hfer'] < 0.85) # # if has_snare_wire and (is_snare_freq or p['peak_freq'] < SNARE_FREQ_MIN): # detected_instruments.append('snare') # # # RULE 3: TOMS (Integrating exact Feb 9 Purity & Resonance checks) # is_pure = p['hfer'] < SNARE_HFER_MIN # toms have almost no 'wire' noise # is_resonant = p['decay'] >= TOM_MIN_DECAY # toms ring longer than kicks # # if is_pure and is_resonant: # if p['peak_freq'] <= TOM_FREQ_LOW_MAX: # # Prevent duplicate tagging if decay pushed it into Low Tom territory # if 'low_tom' not in detected_instruments: # detected_instruments.append('low_tom') # elif p['peak_freq'] <= TOM_FREQ_MID_MAX: # detected_instruments.append('mid_tom') # elif p['peak_freq'] <= 400: # 
Upper safety bound # detected_instruments.append('high_tom') # # return detected_instruments # def classify_membranophone(p): # # Stage 2A: Sorts skins (kick, snare, toms). # Fuses logic from _classifier.py (kick/snare) with the Tom logic from v0.1.0. # # detected_instruments = [] # RULE 1: KICK DRUM (logic from _classifier.py) # Does not rely on decay, so overlapping cymbals won't turn kicks into Low toms # if p['lfer'] >= KICK_LFER_MIN and (KICK_FREQ_MIN <= p['peak_freq'] <= KICK_FREQ_MAX): # detected_instruments.append('kick') # --- LEGACY CODE - COMMENTED OUT SNARE RULE (Fat snare catch caused kick+Hat hallucinations) --- # is_snare_freq = (SNARE_FREQ_MIN <= p['peak_freq'] <= SNARE_FREQ_MAX) # has_snare_wire = (SNARE_HFER_MIN <= p['hfer'] < 0.85) # # if has_snare_wire and (is_snare_freq or p['peak_freq'] < SNARE_FREQ_MIN): # detected_instruments.append('snare') # RULE 2: SNARE DRUM (strict logic from _classifier.py to prevent kick+Hat hallucinations) # is_snare_freq = (SNARE_FREQ_MIN <= p['peak_freq'] <= SNARE_FREQ_MAX) # has_snare_wire = (SNARE_HFER_MIN <= p['hfer'] < 0.85) # if has_snare_wire and is_snare_freq: # detected_instruments.append('snare') # RULE 3: TOMS (From v0.1.0 - pure tone and longer decay) # is_pure = p['hfer'] < SNARE_HFER_MIN # toms have almost no 'wire' noise # is_resonant = p['decay'] >= TOM_MIN_DECAY # toms ring longer than isolated kicks # if is_pure and is_resonant: # if p['peak_freq'] <= TOM_FREQ_LOW_MAX: # if 'kick' not in detected_instruments: # Don't label a kick as a Low Tom # detected_instruments.append('low_tom') # elif p['peak_freq'] <= TOM_FREQ_MID_MAX: # detected_instruments.append('mid_tom') # elif p['peak_freq'] <= 400: # Upper safety bound # detected_instruments.append('high_tom') # return detected_instruments # --- LEGACY CODE - COMMENTED OUT CLASSIFY_IDIOPHONE (March 17 Interim) --- # def classify_idiophone(p): # # # Stage 2B: Sorts Metals (Hats, Cymbals). # Updated to return a list of simultaneous hits. 
# # # detected_instruments = [] # decay = p['decay'] # # # RULE 4: METALS (Hats / Cymbals) # if p['hfer_5k'] >= IDIOPHONE_MIN_HFER_5K: # if decay <= HAT_CLOSED_MAX_DECAY: # detected_instruments.append('hi_hat_closed') # elif decay <= HAT_OPEN_MAX_DECAY: # detected_instruments.append('hi_hat_open') # else: # if p['centroid'] > 2500: # detected_instruments.append('crash') # else: # detected_instruments.append('ride') # # return detected_instruments # def classify_idiophone(p): # Stage 2B: Sorts Metals (Hats, Cymbals). # Uses Feb 9 Decay logic + _classifier.py Centroid thresholds. # detected_instruments = [] # decay = p['decay'] # RULE 4: METALS (Hats / Cymbals) # if p['hfer_5k'] >= IDIOPHONE_MIN_HFER_5K: # if decay <= HAT_CLOSED_MAX_DECAY: # detected_instruments.append('hi_hat_closed') # elif decay <= HAT_OPEN_MAX_DECAY: # detected_instruments.append('hi_hat_open') # else: # Merged logic: use 2500 from _classifier to prevent kick overlaps looking like rides # if p['centroid'] > 2500: # detected_instruments.append('crash') # else: # detected_instruments.append('ride') # # return detected_instruments # --- LEGACY CODE - COMMENTED OUT CLASSIFY_EVENT --- # def classify_event(audio_segment, sr): # # # Stage 1: Class Separation (Skin vs Metal) # # # # physics = get_physics_profile(audio_segment, sr) # # # # # Is it Metal? (High energy > 5kHz) # # if physics['hfer_5k'] >= IDIOPHONE_MIN_HFER_5K: # # return classify_idiophone(physics) # # else: # # return classify_membranophone(physics) # def classify_event(audio_segment, sr): # # Stage 1: Evaluates both skins and Metals simultaneously. 
# Returns a list because multiple drums can hit simultaneously # LEGACY CODE - physics = get_physics_profile(audio_segment, sr) # physics = extract_features(audio_segment, sr) # instruments = [] # Check skins # instruments.extend(classify_membranophone(physics)) # Check metals # instruments.extend(classify_idiophone(physics)) # Fallback # if not instruments: # instruments.append('unknown') # return instruments # --- LEGACY CODE - COMMENTED OUT CLASSIFY_EVENTS --- # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # # # Wrapper to route detected onsets through the Physics-First Classification Engine. # Uses the unified dictionary keys: time_sec, instruments, debug_features. # # # classified_events = [] # # for onset_time in onsets: # start_sample = int(onset_time * sr) # # # --- LEGACY CODE - TRUNCATING LOGIC --- # # end_sample = int((onset_time + 0.2) * sr) # # #duration_secs = ONSET_SLICE_DURATION_MS / 1000.0 # # #end_sample = start_sample + int(duration_secs * sr) # # # # if end_sample > len(audio_path): # # end_sample = len(audio_path) # # if start_sample >= end_sample: # # continue # # # # y_window = audio_path[start_sample:end_sample] # # if len(y_window) == 0: # # continue # # # --- PADDING LOGIC (Prevents SciPy/Librosa STFT Warnings at End of File) --- # duration_secs = ONSET_SLICE_DURATION_MS / 1000.0 # end_sample = start_sample + int(duration_secs * sr) # # if end_sample > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample - len(audio_path) # y_window = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window = audio_path[start_sample:end_sample] # # if len(y_window) == 0: # continue # # # 1. Extract the physics DNA # physics_profile = get_physics_profile(y_window, sr) # # # 2. Run the simultaneous rules # instruments = classify_event(y_window, sr) # # # 3. 
Append with unified compatible keys # classified_events.append( # { # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # } # ) # # return classified_events # LEGACY CODE # def classify_events(audio_path: np.ndarray, sr: int, onsets: list[float]) -> list[dict]: # # Wrapper to route detected onsets through the Physics-First Classification Engine. # Uses the unified dictionary keys: time_sec, instruments, debug_features. # # classified_events = [] # for onset_time in onsets: # start_sample = int(onset_time * sr) # --- PADDING LOGIC (Prevents SciPy/Librosa STFT Warnings at End of File) --- # duration_secs = ONSET_SLICE_DURATION_MS / 1000.0 # end_sample = start_sample + int(duration_secs * sr) # if end_sample > len(audio_path): # slice_data = audio_path[start_sample:] # pad_length = end_sample - len(audio_path) # y_window = np.pad(slice_data, (0, pad_length), mode='constant') # else: # y_window = audio_path[start_sample:end_sample] # if len(y_window) == 0: # continue # 1. Extract the physics DNA # LEGACY CODE - physics_profile = get_physics_profile(y_window, sr) # physics_profile = extract_features(y_window, sr) # 2. Run the simultaneous rules # instruments = classify_event(y_window, sr) # 3. Append with unified compatible keys # classified_events.append( # { # "time_sec": float(onset_time), # "instruments": instruments, # "debug_features": physics_profile # } # ) # return classified_events # print("\n# ------------------------------------------------------------------------------------") # LEGACY CODE (PRESERVING FOR EASE) # Leave these uncommented so not to break orchestration and docs # def analyze_event(y, sr): # # # Calculates specific acoustic features: # # - f0: Fundamental Frequency (Peak Magnitude) # #- sc: Spectral Centroid (Brightness) # # - width: Spectral Bandwidth # # - depth: Decay Ratio (Sustain) # # #:param y: Audio segment. # #:type y: np.ndarray # #:param sr: Sampling rate. 
# #:type sr: int # #:return: Dictionary of features [f0: Fundamental Frequency (Peak Magnitude), # sc: Spectral Centroid (Brightness), width: Spectral Bandwidth, # depth: Decay Ratio (Sustain)] # #:rtype: dict # # # 1. FFT for Frequency Analysis # # High resolution (n_fft=2048) to see low frequencies clearly # # n_fft = 2048 # # spec = np.abs(librosa.stft(y, n_fft=n_fft)) # # freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft) # # n_fft = N_FFT # spec = np.abs(librosa.stft(y, n_fft=N_FFT)) # freqs = librosa.fft_frequencies(sr=sr, n_fft=N_FFT) # # # Sum magnitudes to find the strongest frequency (Fundamental) # sum_spec = np.sum(spec, axis=1) # peak_idx = np.argmax(sum_spec) # f0 = freqs[peak_idx] # # # 2. Spectral Features # # sc = float(np.mean(librosa.feature.spectral_centroid(S=spec, sr=sr))) # # width = float(np.mean(librosa.feature.spectral_bandwidth(S=spec, sr=sr))) # # sc = float(np.mean(librosa.feature.spectral_centroid(S=spec, sr=SAMPLE_RATE))) # width = float(np.mean(librosa.feature.spectral_bandwidth(S=spec, sr=SAMPLE_RATE))) # # 3. Depth / Decay (Sustain) # rms = librosa.feature.rms(y=y)[0] # split = len(rms) // 2 # if np.mean(rms[:split]) < 1e-5: # decay = 0.0 # else: # decay = np.mean(rms[split:]) / np.mean(rms[:split]) # # return { # "f0": float(f0), # "sc": float(round(sc, 2)), # "width": float(round(width, 2)), # "depth": float(round(decay, 2)), # } # # # # Leave these uncommented so as not to break orchestration and docs # def classify_events_legacy (audio_path, sr, onsets) -> List[Dict[str, Any]]: # # Classifies hits strictly based on Fundamental Frequency ($f_0$) ranges. # # :param audio_path: Full audio array. # # :type audio_path: np.ndarray # # :param sr: Sampling rate. # # :type sr: int # # :param onsets: List of onset times. # # :type onsets: list # # :return: List of classified event dictionaries. 
# # :rtype: List[Dict[str, Any]] # # classified_events = [] # # for onset_time in onsets: # # Extract 150ms window for analysis # start_sample = int(onset_time * sr) # end_sample = int((onset_time + 0.15) * sr) # # if end_sample > len(audio_path): # end_sample = len(audio_path) # if start_sample >= end_sample: # continue # # y_window = audio_path[start_sample:end_sample] # if len(y_window) == 0: # continue # # # Analyze # features = analyze_event(y_window, sr) # f0 = features["f0"] # # drum_type = None # # # --- STRICT FREQUENCY RANGE CLASSIFICATION --- # # Order matters for overlaps # # # 1. Low End # if KICK_RANGE[0] <= f0 <= KICK_RANGE[1]: # drum_type = "kick" # elif LOW_TOM_RANGE[0] <= f0 <= LOW_TOM_RANGE[1]: # drum_type = "low_tom" # # # 2. Mids (Check Tom first to catch narrow band, then snare) # elif MID_TOM_RANGE[0] <= f0 <= MID_TOM_RANGE[1]: # drum_type = "mid_tom" # elif SNARE_RANGE[0] <= f0 <= SNARE_RANGE[1]: # drum_type = "snare" # # # 3. Highs # elif OPEN_HAT_RANGE[0] <= f0 <= OPEN_HAT_RANGE[1]: # drum_type = "hi_hat_open" # elif CLOSED_HAT_RANGE[0] <= f0 <= CLOSED_HAT_RANGE[1]: # drum_type = "hi_hat_closed" # elif RIDE_RANGE[0] <= f0 <= RIDE_RANGE[1]: # drum_type = "ride" # elif CRASH_RANGE[0] <= f0 <= CRASH_RANGE[1]: # drum_type = "crash" # # # If detected, append # if drum_type: # meta = DRUM_NOTATION_MAP[drum_type] # classified_events.append( # { # "drum_type": drum_type, # "onset_time_seconds": round(onset_time, 2), # "midi_pitch": meta["midi_program"], # "note_head_type": meta["note_head"], # "staff_position": meta["staff_position"], # "analysis": features, # } # ) # # return classified_events """