Jihuai committed · Commit d572f56 · 0 parents (orphan branch)

have to create an orphan branch to bypass large file history: cleanup .ipynb and create LFS

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .github/FUNDING.yml +14 -0
  3. .gitignore +166 -0
  4. .idea/.gitignore +8 -0
  5. .idea/inspectionProfiles/Project_Default.xml +50 -0
  6. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  7. .idea/misc.xml +4 -0
  8. .idea/modules.xml +8 -0
  9. .idea/query-bandit.iml +8 -0
  10. .idea/vcs.xml +6 -0
  11. .vscode/launch.json +21 -0
  12. LICENSE +21 -0
  13. README.md +124 -0
  14. assets/banquet-logo.png +0 -0
  15. config/data/moisesdb-test.yml +55 -0
  16. config/data/setup-a/moisesdb-vdb-query-d-aug.yml +63 -0
  17. config/data/setup-a/moisesdb-vdb-query-d.yml +46 -0
  18. config/data/setup-a/moisesdb-vdb-query.yml +46 -0
  19. config/data/setup-b/moisesdb-vdbgp-query-d-aug-bal.yml +67 -0
  20. config/data/setup-b/moisesdb-vdbgp-query-d-aug.yml +67 -0
  21. config/data/setup-b/moisesdb-vdbgp-query-d.yml +50 -0
  22. config/data/setup-b/moisesdb-vdbgp-query.yml +50 -0
  23. config/data/setup-c/moisesdb-everything-query-d-aug-bal.yml +117 -0
  24. config/data/setup-c/moisesdb-everything-query-d-aug.yml +117 -0
  25. config/data/setup-c/moisesdb-everything-query-d-bal.yml +100 -0
  26. config/data/setup-c/moisesdb-everything-query-d.yml +100 -0
  27. config/data/vdbo/moisesdb-vdbo-aug.yml +35 -0
  28. config/data/vdbo/moisesdb-vdbo.yml +18 -0
  29. config/losses/both_l1snr.yml +4 -0
  30. config/losses/both_l1snrdbm.yml +4 -0
  31. config/models/bandit-query-pre.yml +31 -0
  32. config/models/bandit-query-prefz.yml +31 -0
  33. config/models/bandit-query.yml +29 -0
  34. config/models/bandit-vdbo.yml +27 -0
  35. config/optim/adam.yml +9 -0
  36. config/trainer/default-long.yml +12 -0
  37. config/trainer/default.yml +12 -0
  38. core/__init__.py +0 -0
  39. core/data/__init__.py +0 -0
  40. core/data/base.py +138 -0
  41. core/data/moisesdb/__init__.py +97 -0
  42. core/data/moisesdb/audio.ipynb +76 -0
  43. core/data/moisesdb/datamodule.py +239 -0
  44. core/data/moisesdb/dataset.py +1383 -0
  45. core/data/moisesdb/eda.ipynb +0 -0
  46. core/data/moisesdb/npyify.py +923 -0
  47. core/data/moisesdb/passt.ipynb +32 -0
  48. core/losses/__init__.py +0 -0
  49. core/losses/base.py +171 -0
  50. core/losses/l1snr.py +110 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.github/FUNDING.yml ADDED
@@ -0,0 +1,14 @@
+ # These are supported funding model platforms
+
+ github: kwatcharasupat # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+ patreon: # Replace with a single Patreon username
+ open_collective: # Replace with a single Open Collective username
+ ko_fi: # Replace with a single Ko-fi username
+ tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+ community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+ liberapay: # Replace with a single Liberapay username
+ issuehunt: # Replace with a single IssueHunt username
+ lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+ polar: # Replace with a single Polar username
+ buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
+ custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
.gitignore ADDED
@@ -0,0 +1,166 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+
+ input/
+ output/
+ logs/
+ checkpoints/
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,50 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <Languages>
+         <language minSize="52" name="Python" />
+       </Languages>
+     </inspection_tool>
+     <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
+     <inspection_tool class="PyAttributeOutsideInitInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+       <option name="ignoredPackages">
+         <value>
+           <list size="8">
+             <item index="0" class="java.lang.String" itemvalue="pytorch_lightning" />
+             <item index="1" class="java.lang.String" itemvalue="torch" />
+             <item index="2" class="java.lang.String" itemvalue="torchaudio" />
+             <item index="3" class="java.lang.String" itemvalue="matplotlib" />
+             <item index="4" class="java.lang.String" itemvalue="ipython" />
+             <item index="5" class="java.lang.String" itemvalue="numpy" />
+             <item index="6" class="java.lang.String" itemvalue="opencv_python" />
+             <item index="7" class="java.lang.String" itemvalue="Pillow" />
+           </list>
+         </value>
+       </option>
+     </inspection_tool>
+     <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <option name="ignoredErrors">
+         <list>
+           <option value="N806" />
+           <option value="N812" />
+           <option value="N802" />
+           <option value="N803" />
+         </list>
+       </option>
+     </inspection_tool>
+     <inspection_tool class="PyShadowingBuiltinsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <option name="ignoredNames">
+         <list>
+           <option value="round" />
+         </list>
+       </option>
+     </inspection_tool>
+     <inspection_tool class="SpellCheckingInspection" enabled="true" level="TYPO" enabled_by_default="true">
+       <option name="processCode" value="false" />
+       <option name="processLiterals" value="true" />
+       <option name="processComments" value="true" />
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (aa-listening-test-sigsep-gen)" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/query-bandit.iml" filepath="$PROJECT_DIR$/.idea/query-bandit.iml" />
+     </modules>
+   </component>
+ </project>
.idea/query-bandit.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+   </component>
+ </project>
.vscode/launch.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "version": "0.2.0",
+   "configurations": [
+     {
+       "name": "Python Debugger: Remote Attach",
+       "type": "debugpy",
+       "request": "attach",
+       "justMyCode": true,
+       "connect": {
+         "host": "localhost",
+         "port": 5678
+       },
+       "pathMappings": [
+         {
+           "localRoot": "${workspaceFolder}",
+           "remoteRoot": "."
+         }
+       ]
+     }
+   ]
+ }
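
This launch configuration is the attach side; the debuggee must open the matching port first. A minimal sketch of the Python side, assuming the `debugpy` package is installed:

```python
# Sketch: debuggee side for the "Python Debugger: Remote Attach" config above.
# The port must match "connect.port" in launch.json.
import debugpy

debugpy.listen(5678)       # serve the debug adapter on localhost:5678
debugpy.wait_for_client()  # block until VS Code attaches
# ...then run the code to be debugged, e.g. the train.py entry point.
```

Equivalently, launching with `python -m debugpy --listen 5678 --wait-for-client train.py ...` (as hinted in the README below) achieves the same without code changes.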
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Karn Watcharasupat
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,124 @@
+ # Language-Audio Banquet
+ <a href='https://github.com/ModistAndrew/query-bandit'><img alt="Static Badge" src="https://img.shields.io/badge/github_repo-lightgrey?logo=github"></a>
+ <a href='https://huggingface.co/spaces/chenxie95/Language-Audio-Banquet'><img alt="Static Badge" src="https://img.shields.io/badge/huggingface_space-yellow?logo=huggingface"></a>
+
+ - Change the query embedding model from PaSST to CLAP, which supports language queries.
+
+ - Change the RNN to a Transformer.
+
+ - Add some utility functions for inference.
+
+ - (TODO) Train on more datasets.
+
+ ## Model weights
+ Download the model weights from the [Hugging Face model repo](https://huggingface.co/chenxie95/Language-Audio-Banquet-ckpt) and put them in `checkpoints/`. `bandit-vdbo-roformer.ckpt` is needed for training; either `ev-pre.ckpt` or `ev-pre-aug.ckpt` can be chosen for inference.
+
+ In addition, download the CLAP query embedding model from [here](https://huggingface.co/lukewys/laion_clap/blob/main/music_speech_epoch_15_esc_89.25.pt) and put it in `checkpoints/querier/`.
+
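+ A minimal download sketch using `huggingface_hub` (a sketch, assuming `pip install huggingface_hub`; repo and file names are those linked above):
+
+ ```python
+ from huggingface_hub import hf_hub_download
+
+ # Separator checkpoint; swap in ev-pre.ckpt or bandit-vdbo-roformer.ckpt as needed.
+ hf_hub_download(
+     repo_id="chenxie95/Language-Audio-Banquet-ckpt",
+     filename="ev-pre-aug.ckpt",
+     local_dir="checkpoints",
+ )
+
+ # CLAP query embedding model.
+ hf_hub_download(
+     repo_id="lukewys/laion_clap",
+     filename="music_speech_epoch_15_esc_89.25.pt",
+     local_dir="checkpoints/querier",
+ )
+ ```
+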
+ ## Inference examples
+ ```bash
+ export CONFIG_ROOT=./config
+ # To attach a debugger, insert `-m debugpy --listen 5678 --wait-for-client` between `python` and `train.py`.
+ python \
+     train.py inference_byoq \
+     checkpoints/ev-pre-aug.ckpt \
+     input/491c1ff5-1e7b-4046-8029-a82d4a8aefb4.wav \
+     input/491c1ff5-1e7b-4046-8029-a82d4a8aefb4_bass.wav \
+     output/491c1ff5-1e7b-4046-8029-a82d4a8aefb4_bass.wav \
+     --batch_size=12 \
+     --use_cuda=true
+
+ python \
+     train.py inference_byoq_text \
+     checkpoints/ev-pre-aug.ckpt \
+     input/491c1ff5-1e7b-4046-8029-a82d4a8aefb4.wav \
+     piano \
+     output/491c1ff5-1e7b-4046-8029-a82d4a8aefb4_piano.wav \
+     --batch_size=12 \
+     --use_cuda=true
+
+ python \
+     train.py inference_test_folder \
+     checkpoints/ev-pre-aug.ckpt \
+     /inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/karaoke_converted/test \
+     output/karaoke \
+     bass \
+     --batch_size=30 \
+     --use_cuda=true \
+     --input_name=mixture
+ ```
+
+ ## Training examples
+ ```bash
+ export CONFIG_ROOT=./config
+ # export DATA_ROOT=/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data
+ # export DATA_ROOT=/dev/shm
+ export DATA_ROOT=/inspire/ssd/project/multilingualspeechrecognition/public
+ export LOG_ROOT=./logs/ev-pre-aug-bal
+ export CUDA_VISIBLE_DEVICES=0
+ python \
+     train.py train \
+     expt/setup-c/bandit-everything-query-pre-d-aug-bal.yml \
+     --ckpt_path=logs/ev-pre-aug-bal/e2e/HBRPOI/lightning_logs/version_1/checkpoints/last.ckpt
+ # You may modify the batch size in the YAML files in config/data/. A batch size of 3 fits on an NVIDIA 4090 (48 GB).
+ ```
+
+ ---
+
+ > ### Please consider giving back to the community if you have benefited from this work.
+ >
+ > If you've **benefited commercially from this work**, which we've poured significant effort into and released under permissive licenses, we hope you've found it valuable! While these licenses give you lots of freedom, we believe in nurturing a vibrant ecosystem where innovation can continue to flourish.
+ >
+ > So, as a gesture of appreciation and responsibility, we strongly urge commercial entities that have gained from this software to consider making voluntary contributions to music-related non-profit organizations of your choice. Your contribution directly helps support the foundational work that empowers your commercial success and ensures open-source innovation keeps moving forward.
+ >
+ > Some suggestions for beneficiaries are provided [here](https://github.com/the-secret-source/nonprofits). Please do not hesitate to contribute to the list by opening pull requests there.
+
+ ---
+
+ <div align="center">
+ <img src="assets/banquet-logo.png">
+ </div>
+
+ # Banquet: A Stem-Agnostic Single-Decoder System for Music Source Separation Beyond Four Stems
+
+ Repository for **A Stem-Agnostic Single-Decoder System for Music Source Separation Beyond Four Stems**
+ by Karn N. Watcharasupat and Alexander Lerch. [arXiv](https://arxiv.org/abs/2406.18747)
+
+ > Despite significant recent progress across multiple subtasks of audio source separation, few music source separation systems support separation beyond the four-stem vocals, drums, bass, and other (VDBO) setup. Of the very few current systems that support source separation beyond this setup, most continue to rely on an inflexible decoder setup that can only support a fixed pre-defined set of stems. Increasing stem support in these inflexible systems correspondingly requires increasing computational complexity, rendering extensions of these systems computationally infeasible for long-tail instruments. In this work, we propose Banquet, a system that allows source separation of multiple stems using just one decoder. A bandsplit source separation model is extended to work in a query-based setup in tandem with a music instrument recognition PaSST model. On the MoisesDB dataset, Banquet, at only 24.9 M trainable parameters, approached the performance level of the significantly more complex 6-stem Hybrid Transformer Demucs on VDBO stems and outperformed it on guitar and piano. The query-based setup allows for the separation of narrow instrument classes such as clean acoustic guitars, and can be successfully applied to the extraction of less common stems such as reeds and organs.
+
+ For the Cinematic Audio Source Separation model, Bandit, see [this repository](https://github.com/kwatcharasupat/bandit).
+
+ ## Inference
+
+ ```bash
+ git clone https://github.com/kwatcharasupat/query-bandit.git
+ cd query-bandit
+ export CONFIG_ROOT="./config"
+
+ python train.py inference_byoq \
+     --ckpt_path="/path/to/checkpoint/see-below.ckpt" \
+     --input_path="/path/to/input/file/fearOfMatlab.wav" \
+     --output_path="/path/to/output/file/fearOfMatlabStemEst/guitar.wav" \
+     --query_path="/path/to/query/file/random-guitar.wav" \
+     --batch_size=12 \
+     --use_cuda=true
+ ```
+ A batch size of 12 _usually_ fits on an RTX 4090.
+
+ ### Model weights
+ Model weights are available on Zenodo [here](https://zenodo.org/records/13694558).
+ If you are not sure, use `ev-pre-aug.ckpt`.
+
+ ## Citation
+ ```
+ @inproceedings{Watcharasupat2024Banquet,
+   title = {A Stem-Agnostic Single-Decoder System for Music Source Separation Beyond Four Stems},
+   booktitle = {Proceedings of the 25th International Society for Music Information Retrieval Conference},
+   author = {Watcharasupat, Karn N. and Lerch, Alexander},
+   year = {2024},
+   month = {nov},
+   eprint = {2406.18747},
+   address = {San Francisco, CA, USA},
+ }
+ ```
assets/banquet-logo.png ADDED
config/data/moisesdb-test.yml ADDED
@@ -0,0 +1,55 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesTestDataModule
+ batch_size: 1
+ effective_batch_size: null
+ num_workers: 8
+
+ inference_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 0.5
+   batch_size: 12
+   fs: 44100
+
+ test_kwargs:
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+ n_channels: 2
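
The `${oc.env:DATA_ROOT}` syntax is OmegaConf's environment-variable resolver, which is why the examples export `DATA_ROOT` before running. A minimal sketch of how `data_root` resolves, assuming the YAML is loaded with OmegaConf:

```python
# Sketch: resolving ${oc.env:DATA_ROOT} with OmegaConf.
import os
from omegaconf import OmegaConf

os.environ["DATA_ROOT"] = "/data"  # normally exported in the shell
conf = OmegaConf.load("config/data/moisesdb-test.yml")
print(conf.data_root)  # -> "/data/moisesdb"; interpolations resolve on access
```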
config/data/setup-a/moisesdb-vdb-query-d-aug.yml ADDED
@@ -0,0 +1,63 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       # "distorted_electric_guitar",
+       # "clean_electric_guitar",
+       # "acoustic_guitar",
+     ]
+   query_file: "query-10s"
+   augment:
+     - cls: Shift
+       kwargs:
+         p: 1.0
+         min_shift: -0.5
+         max_shift: 0.5
+     - cls: Gain
+       kwargs:
+         p: 1.0
+         min_gain_in_db: -6
+         max_gain_in_db: 6
+     - cls: ShuffleChannels
+       kwargs:
+         p: 0.5
+     - cls: PolarityInversion
+       kwargs:
+         p: 0.5
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       # "distorted_electric_guitar",
+       # "clean_electric_guitar",
+       # "acoustic_guitar",
+     ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
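
The `augment` entries carry `cls` names and kwargs that match the `torch-audiomentations` API (`Gain`, `Shift`, `PolarityInversion`, `ShuffleChannels`); a sketch of how the list might be materialized, under that assumption:

```python
# Sketch: building the waveform augmentation pipeline from the `augment` list,
# assuming the `cls` names resolve to torch_audiomentations transforms.
import torch_audiomentations as aug

augment_cfg = [
    {"cls": "Shift", "kwargs": {"p": 1.0, "min_shift": -0.5, "max_shift": 0.5}},
    {"cls": "Gain", "kwargs": {"p": 1.0, "min_gain_in_db": -6, "max_gain_in_db": 6}},
    {"cls": "ShuffleChannels", "kwargs": {"p": 0.5}},
    {"cls": "PolarityInversion", "kwargs": {"p": 0.5}},
]
transforms = [getattr(aug, c["cls"])(**c["kwargs"]) for c in augment_cfg]
pipeline = aug.Compose(transforms)  # expects (batch, channels, time) tensors
```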
config/data/setup-a/moisesdb-vdb-query-d.yml ADDED
@@ -0,0 +1,46 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       # "distorted_electric_guitar",
+       # "clean_electric_guitar",
+       # "acoustic_guitar",
+     ]
+   query_file: "query-10s"
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       # "distorted_electric_guitar",
+       # "clean_electric_guitar",
+       # "acoustic_guitar",
+     ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-a/moisesdb-vdb-query.yml ADDED
@@ -0,0 +1,46 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: true
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       # "distorted_electric_guitar",
+       # "clean_electric_guitar",
+       # "acoustic_guitar",
+     ]
+   query_file: "query-10s"
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: true
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       # "distorted_electric_guitar",
+       # "clean_electric_guitar",
+       # "acoustic_guitar",
+     ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-b/moisesdb-vdbgp-query-d-aug-bal.yml ADDED
@@ -0,0 +1,67 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesBalancedTrainDataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+   augment:
+     - cls: Shift
+       kwargs:
+         p: 1.0
+         min_shift: -0.5
+         max_shift: 0.5
+     - cls: Gain
+       kwargs:
+         p: 1.0
+         min_gain_in_db: -6
+         max_gain_in_db: 6
+     - cls: ShuffleChannels
+       kwargs:
+         p: 0.5
+     - cls: PolarityInversion
+       kwargs:
+         p: 0.5
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-b/moisesdb-vdbgp-query-d-aug.yml ADDED
@@ -0,0 +1,67 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+   augment:
+     - cls: Shift
+       kwargs:
+         p: 1.0
+         min_shift: -0.5
+         max_shift: 0.5
+     - cls: Gain
+       kwargs:
+         p: 1.0
+         min_gain_in_db: -6
+         max_gain_in_db: 6
+     - cls: ShuffleChannels
+       kwargs:
+         p: 0.5
+     - cls: PolarityInversion
+       kwargs:
+         p: 0.5
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-b/moisesdb-vdbgp-query-d.yml ADDED
@@ -0,0 +1,50 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-b/moisesdb-vdbgp-query.yml ADDED
@@ -0,0 +1,50 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: true
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: true
+   allowed_stems:
+     [
+       "bass",
+       "drums",
+       "lead_male_singer",
+       "lead_female_singer",
+       "distorted_electric_guitar",
+       "clean_electric_guitar",
+       "acoustic_guitar",
+       'grand_piano',
+       'electric_piano',
+     ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-c/moisesdb-everything-query-d-aug-bal.yml ADDED
@@ -0,0 +1,117 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesBalancedTrainDataModule
+ batch_size: 3
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+   augment:
+     - cls: Shift
+       kwargs:
+         p: 1.0
+         min_shift: -0.5
+         max_shift: 0.5
+     - cls: Gain
+       kwargs:
+         p: 1.0
+         min_gain_in_db: -6
+         max_gain_in_db: 6
+     - cls: ShuffleChannels
+       kwargs:
+         p: 0.5
+     - cls: PolarityInversion
+       kwargs:
+         p: 0.5
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-c/moisesdb-everything-query-d-aug.yml ADDED
@@ -0,0 +1,117 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 3
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+   augment:
+     - cls: Shift
+       kwargs:
+         p: 1.0
+         min_shift: -0.5
+         max_shift: 0.5
+     - cls: Gain
+       kwargs:
+         p: 1.0
+         min_gain_in_db: -6
+         max_gain_in_db: 6
+     - cls: ShuffleChannels
+       kwargs:
+         p: 0.5
+     - cls: PolarityInversion
+       kwargs:
+         p: 0.5
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-c/moisesdb-everything-query-d-bal.yml ADDED
@@ -0,0 +1,100 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesBalancedTrainDataModule
+ batch_size: 3
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/setup-c/moisesdb-everything-query-d.yml ADDED
@@ -0,0 +1,100 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesDataModule
+ batch_size: 3
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   query_size_seconds: 10.0
+   top_k_instrument: 10
+   npy_memmap: true
+   mixture_stem: mixture
+   use_own_query: false
+   allowed_stems: [
+     "drums",
+     "lead_male_singer",
+     "lead_female_singer",
+     # "human_choir",
+     "background_vocals",
+     # "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     # "contrabass_double_bass",
+     # "tuba",
+     # "bassoon",
+     "fx",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     # "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     # "violin",
+     # "viola",
+     # "cello",
+     # "violin_section",
+     # "viola_section",
+     # "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     # "flutes",
+     "reeds",
+     "other_wind"
+   ]
+   query_file: "query-10s"
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/vdbo/moisesdb-vdbo-aug.yml ADDED
@@ -0,0 +1,35 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesVDBODataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   fs: 44100
+   npy_memmap: true
+   augment:
+     - cls: Shift
+       kwargs:
+         p: 1.0
+         min_shift: -0.5
+         max_shift: 0.5
+     - cls: Gain
+       kwargs:
+         p: 1.0
+         min_gain_in_db: -6
+         max_gain_in_db: 6
+     - cls: ShuffleChannels
+       kwargs:
+         p: 0.5
+     - cls: PolarityInversion
+       kwargs:
+         p: 0.5
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   fs: 44100
+   npy_memmap: true
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/data/vdbo/moisesdb-vdbo.yml ADDED
@@ -0,0 +1,18 @@
+ data_root: ${oc.env:DATA_ROOT}/moisesdb
+ cls: MoisesVDBODataModule
+ batch_size: 4
+ effective_batch_size: null
+ num_workers: 8
+ train_kwargs:
+   target_length: 8192
+   chunk_size_seconds: 6.0
+   fs: 44100
+   npy_memmap: true
+ val_kwargs:
+   chunk_size_seconds: 6.0
+   hop_size_seconds: 6.0
+   fs: 44100
+   npy_memmap: true
+ test_kwargs:
+   npy_memmap: true
+ n_channels: 2
config/losses/both_l1snr.yml ADDED
@@ -0,0 +1,4 @@
+ cls: L1SNRLoss
+ modality:
+   - audio
+   - spectrogram
config/losses/both_l1snrdbm.yml ADDED
@@ -0,0 +1,4 @@
+ cls: L1SNRDecibelMatchLoss
+ modality:
+   - audio
+   - spectrogram
config/models/bandit-query-pre.yml ADDED
@@ -0,0 +1,31 @@
+ cls: PasstFiLMConditionedBandit
+ kwargs:
+   in_channel: 2
+   band_type: "musical"
+   n_bands: 64
+   additive_film: true
+   multiplicative_film: true
+   film_depth: 2
+   n_sqm_modules: 8
+   emb_dim: 128
+   rnn_dim: 256
+   bidirectional: true
+   rnn_type: "GRU"
+   mlp_dim: 512
+   hidden_activation: "Tanh"
+   hidden_activation_kwargs: null
+   complex_mask: true
+   use_freq_weights: true
+   n_fft: 2048
+   win_length: 2048
+   hop_length: 512
+   window_fn: "hann_window"
+   wkwargs: null
+   power: null
+   center: true
+   normalized: true
+   pad_mode: "reflect"
+   onesided: true
+   fs: 44100
+   pretrain_encoder: checkpoints/bandit-vdbo-roformer.ckpt
+   freeze_encoder: false
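
The `n_fft`/`win_length`/`hop_length`/`window_fn`/`power` kwargs mirror `torchaudio.transforms.Spectrogram`; a sketch of the STFT front end they describe (`power: null` keeps the complex STFT, which a complex mask requires):

```python
# Sketch: the STFT front end implied by the spectrogram kwargs above.
import torch
import torchaudio

spec = torchaudio.transforms.Spectrogram(
    n_fft=2048,
    win_length=2048,
    hop_length=512,
    window_fn=torch.hann_window,
    power=None,        # None -> complex STFT rather than magnitude
    center=True,
    normalized=True,
    pad_mode="reflect",
    onesided=True,
)
x = torch.randn(2, 44100)  # (channels, samples) at fs = 44100
X = spec(x)                # complex tensor of shape (2, 1025, frames)
```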
config/models/bandit-query-prefz.yml ADDED
@@ -0,0 +1,31 @@
+ cls: PasstFiLMConditionedBandit
+ kwargs:
+   in_channel: 2
+   band_type: "musical"
+   n_bands: 64
+   additive_film: true
+   multiplicative_film: true
+   film_depth: 2
+   n_sqm_modules: 8
+   emb_dim: 128
+   rnn_dim: 256
+   bidirectional: true
+   rnn_type: "GRU"
+   mlp_dim: 512
+   hidden_activation: "Tanh"
+   hidden_activation_kwargs: null
+   complex_mask: true
+   use_freq_weights: true
+   n_fft: 2048
+   win_length: 2048
+   hop_length: 512
+   window_fn: "hann_window"
+   wkwargs: null
+   power: null
+   center: true
+   normalized: true
+   pad_mode: "reflect"
+   onesided: true
+   fs: 44100
+   pretrain_encoder: checkpoints/bandit-vdbo-roformer.ckpt
+   freeze_encoder: true
config/models/bandit-query.yml ADDED
@@ -0,0 +1,29 @@
+ cls: PasstFiLMConditionedBandit
+ kwargs:
+   in_channel: 2
+   band_type: "musical"
+   n_bands: 64
+   additive_film: true
+   multiplicative_film: true
+   film_depth: 2
+   n_sqm_modules: 8
+   emb_dim: 128
+   rnn_dim: 256
+   bidirectional: true
+   rnn_type: "GRU"
+   mlp_dim: 512
+   hidden_activation: "Tanh"
+   hidden_activation_kwargs: null
+   complex_mask: true
+   use_freq_weights: true
+   n_fft: 2048
+   win_length: 2048
+   hop_length: 512
+   window_fn: "hann_window"
+   wkwargs: null
+   power: null
+   center: true
+   normalized: true
+   pad_mode: "reflect"
+   onesided: true
+   fs: 44100
config/models/bandit-vdbo.yml ADDED
@@ -0,0 +1,27 @@
+ cls: Bandit
+ kwargs:
+   in_channel: 2
+   stems: ["vocals", "bass", "drums", "vdbo_others"]
+   band_type: "musical"
+   n_bands: 64
+   n_sqm_modules: 8
+   emb_dim: 128
+   rnn_dim: 256
+   bidirectional: true
+   rnn_type: "GRU"
+   mlp_dim: 512
+   hidden_activation: "Tanh"
+   hidden_activation_kwargs: null
+   complex_mask: true
+   use_freq_weights: true
+   n_fft: 2048
+   win_length: 2048
+   hop_length: 512
+   window_fn: "hann_window"
+   wkwargs: null
+   power: null
+   center: true
+   normalized: true
+   pad_mode: "reflect"
+   onesided: true
+   fs: 44100
config/optim/adam.yml ADDED
@@ -0,0 +1,9 @@
+ optimizer:
+   cls: Adam
+   kwargs:
+     lr: 1.0e-3
+ scheduler:
+   cls: StepLR
+   kwargs:
+     step_size: 1
+     gamma: 0.98
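
In PyTorch terms, this config describes the following pair; with `step_size: 1`, the learning rate decays by 2% at every scheduler step (a sketch; the model is a stand-in):

```python
# Sketch: optimizer/scheduler pair described by config/optim/adam.yml.
import torch

model = torch.nn.Linear(8, 8)  # stand-in for the actual model
optimizer = torch.optim.Adam(model.parameters(), lr=1.0e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.98)
```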
config/trainer/default-long.yml ADDED
@@ -0,0 +1,12 @@
+ callbacks:
+   checkpoint:
+     monitor: val/loss
+     mode: min
+     save_top_k: 3
+     save_last: True
+ max_epochs: 500
+ accumulate_grad_batches: null
+ gradient_clip_val: 10.0
+ gradient_clip_algorithm: norm
+ logger:
+   save_dir: ${oc.env:LOG_ROOT}/e2e
config/trainer/default.yml ADDED
@@ -0,0 +1,12 @@
+ callbacks:
+   checkpoint:
+     monitor: val/loss
+     mode: min
+     save_top_k: 3
+     save_last: True
+ max_epochs: 150
+ accumulate_grad_batches: null
+ gradient_clip_val: 10.0
+ gradient_clip_algorithm: norm
+ logger:
+   save_dir: ${oc.env:LOG_ROOT}/e2e
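
A sketch of the PyTorch Lightning objects this maps onto (the logger class is an assumption; the config only specifies a `save_dir`):

```python
# Sketch: Trainer/checkpoint setup described by config/trainer/default.yml.
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint(monitor="val/loss", mode="min", save_top_k=3, save_last=True)
trainer = pl.Trainer(
    max_epochs=150,
    gradient_clip_val=10.0,
    gradient_clip_algorithm="norm",
    callbacks=[checkpoint],
    logger=pl.loggers.TensorBoardLogger(save_dir="./logs/e2e"),  # ${oc.env:LOG_ROOT}/e2e
)
```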
core/__init__.py ADDED
File without changes
core/data/__init__.py ADDED
File without changes
core/data/base.py ADDED
@@ -0,0 +1,138 @@
+ import inspect
+ import os
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
+
+ import numpy as np
+ import torch
+ import torchaudio as ta
+ from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
+ from torch.utils import data
+
+ from pytorch_lightning import LightningDataModule
+ from torch.utils.data import Dataset, DataLoader, IterableDataset
+
+
+ def from_datasets(
+     train_dataset: Optional[Union[Dataset, Sequence[Dataset], Mapping[str, Dataset]]] = None,
+     val_dataset: Optional[Union[Dataset, Sequence[Dataset]]] = None,
+     test_dataset: Optional[Union[Dataset, Sequence[Dataset]]] = None,
+     predict_dataset: Optional[Union[Dataset, Sequence[Dataset]]] = None,
+     batch_size: int = 1,
+     num_workers: int = 0,
+     **datamodule_kwargs: Any,
+ ) -> "LightningDataModule":
+
+     def dataloader(ds: Dataset, shuffle: bool = False) -> DataLoader:
+         shuffle &= not isinstance(ds, IterableDataset)
+         return DataLoader(
+             ds,
+             batch_size=batch_size,
+             shuffle=shuffle,
+             num_workers=num_workers,
+             pin_memory=True,
+             prefetch_factor=4,
+             persistent_workers=True,
+         )
+
+     def train_dataloader() -> TRAIN_DATALOADERS:
+         assert train_dataset
+
+         if isinstance(train_dataset, Mapping):
+             return {key: dataloader(ds, shuffle=True) for key, ds in train_dataset.items()}
+         if isinstance(train_dataset, Sequence):
+             return [dataloader(ds, shuffle=True) for ds in train_dataset]
+         return dataloader(train_dataset, shuffle=True)
+
+     def val_dataloader() -> EVAL_DATALOADERS:
+         assert val_dataset
+
+         if isinstance(val_dataset, Sequence):
+             return [dataloader(ds) for ds in val_dataset]
+         return dataloader(val_dataset)
+
+     def test_dataloader() -> EVAL_DATALOADERS:
+         assert test_dataset
+
+         if isinstance(test_dataset, Sequence):
+             return [dataloader(ds) for ds in test_dataset]
+         return dataloader(test_dataset)
+
+     def predict_dataloader() -> EVAL_DATALOADERS:
+         assert predict_dataset
+
+         if isinstance(predict_dataset, Sequence):
+             return [dataloader(ds) for ds in predict_dataset]
+         return dataloader(predict_dataset)
+
+     candidate_kwargs = {"batch_size": batch_size, "num_workers": num_workers}
+     accepted_params = inspect.signature(LightningDataModule.__init__).parameters
+     accepts_kwargs = any(param.kind == param.VAR_KEYWORD for param in accepted_params.values())
+     if accepts_kwargs:
+         special_kwargs = candidate_kwargs
+     else:
+         accepted_param_names = set(accepted_params)
+         accepted_param_names.discard("self")
+         special_kwargs = {k: v for k, v in candidate_kwargs.items() if k in accepted_param_names}
+
+     datamodule = LightningDataModule(**datamodule_kwargs, **special_kwargs)
+     if train_dataset is not None:
+         datamodule.train_dataloader = train_dataloader  # type: ignore[method-assign]
+     if val_dataset is not None:
+         datamodule.val_dataloader = val_dataloader  # type: ignore[method-assign]
+     if test_dataset is not None:
+         datamodule.test_dataloader = test_dataloader  # type: ignore[method-assign]
+     if predict_dataset is not None:
+         datamodule.predict_dataloader = predict_dataloader  # type: ignore[method-assign]
+
+     return datamodule
+
+
+ class BaseSourceSeparationDataset(data.Dataset, ABC):
+     def __init__(
+         self,
+         split: str,
+         stems: List[str],
+         files: List[str],
+         data_path: str,
+         fs: int,
+         npy_memmap: bool,
+         recompute_mixture: bool,
+     ):
+         if "mixture" not in stems:
+             stems = ["mixture"] + stems
+
+         self.split = split
+         self.stems = stems
+         self.stems_no_mixture = [s for s in stems if s != "mixture"]
+         self.files = files
+         self.data_path = data_path
+         self.fs = fs
+         self.npy_memmap = npy_memmap
+         self.recompute_mixture = recompute_mixture
+
+     @abstractmethod
+     def get_stem(self, *, stem: str, identifier: Dict[str, Any]) -> torch.Tensor:
+         raise NotImplementedError
+
+     def _get_audio(self, stems, identifier: Dict[str, Any]):
+         audio = {}
+         for stem in stems:
+             audio[stem] = self.get_stem(stem=stem, identifier=identifier)
+
+         return audio
+
+     def get_audio(self, identifier: Dict[str, Any]):
+         if self.recompute_mixture:
+             audio = self._get_audio(self.stems_no_mixture, identifier=identifier)
+             audio["mixture"] = self.compute_mixture(audio)
+             return audio
+         else:
+             return self._get_audio(self.stems, identifier=identifier)
+
+     @abstractmethod
+     def get_identifier(self, index: int) -> Dict[str, Any]:
+         pass
+
+     def compute_mixture(self, audio) -> torch.Tensor:
+         return sum(audio[stem] for stem in audio if stem != "mixture")
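
A usage sketch for `from_datasets` with placeholder datasets (note the hard-coded `persistent_workers=True`/`prefetch_factor=4` in `dataloader` require `num_workers > 0`):

```python
# Sketch: minimal use of from_datasets; TensorDataset stands in for MoisesDB datasets.
import torch
from torch.utils.data import TensorDataset

train_ds = TensorDataset(torch.randn(32, 2, 44100))
val_ds = TensorDataset(torch.randn(8, 2, 44100))

dm = from_datasets(train_dataset=train_ds, val_dataset=val_ds, batch_size=4, num_workers=2)
batch = next(iter(dm.train_dataloader()))  # list holding one (4, 2, 44100) tensor
```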
core/data/moisesdb/__init__.py ADDED
@@ -0,0 +1,97 @@
+ taxonomy = {
+     "vocals": [
+         "lead male singer",
+         "lead female singer",
+         "human choir",
+         "background vocals",
+         "other (vocoder, beatboxing etc)",
+     ],
+     "bass": [
+         "bass guitar",
+         "bass synthesizer (moog etc)",
+         "contrabass/double bass (bass of instrings)",
+         "tuba (bass of brass)",
+         "bassoon (bass of woodwind)",
+     ],
+     "drums": [
+         "snare drum",
+         "toms",
+         "kick drum",
+         "cymbals",
+         "overheads",
+         "full acoustic drumkit",
+         "drum machine",
+     ],
+     "other": [
+         "fx/processed sound, scratches, gun shots, explosions etc",
+         "click track",
+     ],
+     "guitar": [
+         "clean electric guitar",
+         "distorted electric guitar",
+         "lap steel guitar or slide guitar",
+         "acoustic guitar",
+     ],
+     "other plucked": ["banjo, mandolin, ukulele, harp etc"],
+     "percussion": [
+         "a-tonal percussion (claps, shakers, congas, cowbell etc)",
+         "pitched percussion (mallets, glockenspiel, ...)",
+     ],
+     "piano": [
+         "grand piano",
+         "electric piano (rhodes, wurlitzer, piano sound alike)",
+     ],
+     "other keys": [
+         "organ, electric organ",
+         "synth pad",
+         "synth lead",
+         "other sounds (hapischord, melotron etc)",
+     ],
+     "bowed strings": [
+         "violin (solo)",
+         "viola (solo)",
+         "cello (solo)",
+         "violin section",
+         "viola section",
+         "cello section",
+         "string section",
+         "other strings",
+     ],
+     "wind": [
+         "brass (trumpet, trombone, french horn, brass etc)",
+         "flutes (piccolo, bamboo flute, panpipes, flutes etc)",
+         "reeds (saxophone, clarinets, oboe, english horn, bagpipe)",
+         "other wind",
+     ],
+ }
+
+
+ def clean_track_inst(inst):
+
+     if "fx" in inst:
+         inst = "fx"
+
+     if "contrabass_double_bass" in inst:
+         inst = "double_bass"
+
+     if "banjo" in inst:
+         return "other_plucked"
+
+     if "(" in inst:
+         inst = inst.split("(")[0]
+
+     for s in [",", "-"]:
+         if s in inst:
+             inst = inst.replace(s, "")
+
+     for s in ["/"]:
+         if s in inst:
+             inst = inst.replace(s, "_")
+
+     if inst[-1] == "_":
+         inst = inst[:-1]
+
+     return inst
+
+
+ taxonomy = {k: [clean_track_inst(i.replace(" ", "_")) for i in v] for k, v in taxonomy.items()}
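
A few worked examples of `clean_track_inst` on raw labels (spaces already replaced by underscores, as in the comprehension above):

```python
# Worked examples of the label cleanup above.
print(clean_track_inst("electric_piano_(rhodes,_wurlitzer,_piano_sound_alike)"))
# -> "electric_piano"   (parenthetical dropped, trailing "_" stripped)
print(clean_track_inst("contrabass_double_bass_(bass_of_instrings)"))
# -> "double_bass"
print(clean_track_inst("fx/processed_sound,_scratches,_gun_shots,_explosions_etc"))
# -> "fx"
```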
core/data/moisesdb/audio.ipynb ADDED
@@ -0,0 +1,76 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-02-17T00:59:48.228125593Z",
+      "start_time": "2024-02-17T00:59:47.533488738Z"
+     },
+     "collapsed": true
+    },
+    "outputs": [],
+    "source": [
+     "import numpy as np\n",
+     "import IPython\n",
+     "file = \"/home/kwatchar3/Documents/data/moisesdb/npy/e2ccbc17-44bf-431a-af2b-4cf2fbd19a72/mixture.npy\"\n",
+     "\n",
+     "audio = np.load(file)\n",
+     "\n",
+     "IPython.display.Audio(audio, rate=44100)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2024-02-16T18:01:10.487779628Z",
+      "start_time": "2024-02-16T18:01:06.898408871Z"
+     },
+     "collapsed": false
+    },
+    "outputs": [],
+    "source": [
+     "from scipy.signal import spectrogram\n",
+     "\n",
+     "f, t, Sxx = spectrogram(audio, 44100, nperseg=1024, noverlap=512)\n",
+     "\n",
+     "import matplotlib.pyplot as plt\n",
+     "\n",
+     "plt.pcolormesh(t, f, 10 * np.log10(Sxx[0]))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+     "collapsed": false
+    },
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 2
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython2",
+    "version": "2.7.6"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+ }
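The same quick inspection as a plain script, for running outside Jupyter. The .npy path is a placeholder, the array is assumed to be (channels, samples) float32 at 44.1 kHz, and the small epsilon (not in the notebook above) guards against log of zero on silent frames.

import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import spectrogram

audio = np.load("mixture.npy")  # placeholder path, shape (2, n_samples)
f, t, Sxx = spectrogram(audio, fs=44100, nperseg=1024, noverlap=512)
plt.pcolormesh(t, f, 10 * np.log10(Sxx[0] + 1e-12))  # left channel, in dB
plt.xlabel("time (s)")
plt.ylabel("frequency (Hz)")
plt.show()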
core/data/moisesdb/datamodule.py ADDED
@@ -0,0 +1,239 @@
+ import os.path
+ from typing import Mapping, Optional
+
+ import pytorch_lightning as pl
+
+ from core.data.base import from_datasets
+ from core.data.moisesdb.dataset import MoisesDBRandomChunkBalancedRandomQueryDataset, MoisesDBRandomChunkRandomQueryDataset, \
+     MoisesDBDeterministicChunkDeterministicQueryDataset, \
+     MoisesDBFullTrackDataset, MoisesDBVDBODeterministicChunkDataset, \
+     MoisesDBVDBOFullTrackDataset, MoisesDBVDBORandomChunkDataset, \
+     MoisesDBFullTrackTestQueryDataset
+
+
+ def MoisesDataModule(
+     data_root: str,
+     batch_size: int,
+     num_workers: int = 8,
+     train_kwargs: Optional[Mapping] = None,
+     val_kwargs: Optional[Mapping] = None,
+     test_kwargs: Optional[Mapping] = None,
+     datamodule_kwargs: Optional[Mapping] = None,
+ ) -> pl.LightningDataModule:
+     if train_kwargs is None:
+         train_kwargs = {}
+
+     if val_kwargs is None:
+         val_kwargs = {}
+
+     if test_kwargs is None:
+         test_kwargs = {}
+
+     if datamodule_kwargs is None:
+         datamodule_kwargs = {}
+
+     train_dataset = MoisesDBRandomChunkRandomQueryDataset(
+         data_root=data_root, split="train", **train_kwargs
+     )
+
+     val_dataset = MoisesDBDeterministicChunkDeterministicQueryDataset(
+         data_root=data_root, split="val", **val_kwargs
+     )
+
+     test_dataset = MoisesDBDeterministicChunkDeterministicQueryDataset(
+         data_root=data_root, split="test", **test_kwargs
+     )
+
+     datamodule = from_datasets(
+         train_dataset=train_dataset,
+         val_dataset=val_dataset,
+         test_dataset=test_dataset,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         **datamodule_kwargs
+     )
+
+     datamodule.predict_dataloader = (  # type: ignore[method-assign]
+         datamodule.test_dataloader
+     )
+
+     return datamodule
+
+
+ def MoisesBalancedTrainDataModule(
+     data_root: str,
+     batch_size: int,
+     num_workers: int = 8,
+     train_kwargs: Optional[Mapping] = None,
+     val_kwargs: Optional[Mapping] = None,
+     test_kwargs: Optional[Mapping] = None,
+     datamodule_kwargs: Optional[Mapping] = None,
+ ) -> pl.LightningDataModule:
+     if train_kwargs is None:
+         train_kwargs = {}
+
+     if val_kwargs is None:
+         val_kwargs = {}
+
+     if test_kwargs is None:
+         test_kwargs = {}
+
+     if datamodule_kwargs is None:
+         datamodule_kwargs = {}
+
+     train_dataset = MoisesDBRandomChunkBalancedRandomQueryDataset(
+         data_root=data_root, split="train", **train_kwargs
+     )
+
+     val_dataset = MoisesDBDeterministicChunkDeterministicQueryDataset(
+         data_root=data_root, split="val", **val_kwargs
+     )
+
+     test_dataset = MoisesDBDeterministicChunkDeterministicQueryDataset(
+         data_root=data_root, split="test", **test_kwargs
+     )
+
+     datamodule = from_datasets(
+         train_dataset=train_dataset,
+         val_dataset=val_dataset,
+         test_dataset=test_dataset,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         **datamodule_kwargs
+     )
+
+     datamodule.predict_dataloader = (  # type: ignore[method-assign]
+         datamodule.test_dataloader
+     )
+
+     return datamodule
+
+
+ def MoisesValidationDataModule(
+     data_root: str,
+     batch_size: int,
+     num_workers: int = 8,
+     val_kwargs: Optional[Mapping] = None,
+     datamodule_kwargs: Optional[Mapping] = None,
+     **kwargs
+ ) -> pl.LightningDataModule:
+     if val_kwargs is None:
+         val_kwargs = {}
+
+     if datamodule_kwargs is None:
+         datamodule_kwargs = {}
+
+     allowed_stems = val_kwargs.get("allowed_stems", None)
+
+     assert allowed_stems is not None, "allowed_stems must be provided"
+
+     val_datasets = []
+
+     for allowed_stem in allowed_stems:
+         kwargs = val_kwargs.copy()
+         kwargs["allowed_stems"] = [allowed_stem]
+         val_dataset = MoisesDBDeterministicChunkDeterministicQueryDataset(
+             data_root=data_root, split="val",
+             **kwargs
+         )
+
+         val_datasets.append(val_dataset)
+
+     datamodule = from_datasets(
+         val_dataset=val_datasets,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         **datamodule_kwargs
+     )
+
+     datamodule.predict_dataloader = (  # type: ignore[method-assign]
+         datamodule.val_dataloader
+     )
+
+     return datamodule
+
+
+ def MoisesTestDataModule(
+     data_root: str,
+     batch_size: int = 1,
+     num_workers: int = 8,
+     test_kwargs: Optional[Mapping] = None,
+     datamodule_kwargs: Optional[Mapping] = None,
+     **kwargs
+ ) -> pl.LightningDataModule:
+     if test_kwargs is None:
+         test_kwargs = {}
+
+     if datamodule_kwargs is None:
+         datamodule_kwargs = {}
+
+     allowed_stems = test_kwargs.get("allowed_stems", None)
+
+     assert allowed_stems is not None, "allowed_stems must be provided"
+
+     test_dataset = MoisesDBFullTrackTestQueryDataset(
+         data_root=data_root, split="test",
+         **test_kwargs
+     )
+
+     datamodule = from_datasets(
+         test_dataset=test_dataset,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         **datamodule_kwargs
+     )
+
+     datamodule.predict_dataloader = (  # type: ignore[method-assign]
+         datamodule.test_dataloader
+     )
+
+     return datamodule
+
+
+ def MoisesVDBODataModule(
+     data_root: str,
+     batch_size: int,
+     num_workers: int = 8,
+     train_kwargs: Optional[Mapping] = None,
+     val_kwargs: Optional[Mapping] = None,
+     test_kwargs: Optional[Mapping] = None,
+     datamodule_kwargs: Optional[Mapping] = None,
+ ):
+     if train_kwargs is None:
+         train_kwargs = {}
+
+     if val_kwargs is None:
+         val_kwargs = {}
+
+     if test_kwargs is None:
+         test_kwargs = {}
+
+     if datamodule_kwargs is None:
+         datamodule_kwargs = {}
+
+     train_dataset = MoisesDBVDBORandomChunkDataset(
+         data_root=data_root, split="train", **train_kwargs
+     )
+
+     val_dataset = MoisesDBVDBODeterministicChunkDataset(
+         data_root=data_root, split="val", **val_kwargs
+     )
+
+     test_dataset = MoisesDBVDBOFullTrackDataset(
+         data_root=data_root, split="test", **test_kwargs
+     )
+
+     predict_dataset = test_dataset
+
+     datamodule = from_datasets(
+         train_dataset=train_dataset,
+         val_dataset=val_dataset,
+         test_dataset=test_dataset,
+         predict_dataset=predict_dataset,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         **datamodule_kwargs
+     )
+
+     return datamodule
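A minimal sketch of how these factory functions are typically consumed with a Lightning trainer; the data path and kwargs below are placeholders, and `model` stands in for whatever LightningModule is being trained.

import pytorch_lightning as pl

from core.data.moisesdb.datamodule import MoisesVDBODataModule

datamodule = MoisesVDBODataModule(
    data_root="/path/to/moisesdb",  # placeholder
    batch_size=4,
    train_kwargs={"target_length": 8192, "chunk_size_seconds": 4.0},
)
trainer = pl.Trainer(max_epochs=1, accelerator="auto")
# trainer.fit(model, datamodule=datamodule)  # model: a pl.LightningModule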
core/data/moisesdb/dataset.py ADDED
@@ -0,0 +1,1383 @@
+ import math
+ import os
+ import random
+ import warnings
+ from abc import ABC
+ from collections import defaultdict
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import numpy as np
+ from omegaconf import OmegaConf
+ import pandas as pd
+ import torch
+ from torch_audiomentations.utils.object_dict import ObjectDict
+ import torchaudio as ta
+ from torch.utils import data
+ from tqdm import tqdm
+
+ from core.data.base import BaseSourceSeparationDataset
+ from core.types import input_dict
+
+ from . import clean_track_inst
+
+ from torch import Tensor, nn
+
+ DBFS_HOP_SIZE = int(0.125 * 44100)
+ DBFS_CHUNK_SIZE = int(1 * 44100)
+
+ INST_BY_OCCURRENCE = [
+     "bass_guitar",
+     "kick_drum",
+     "snare_drum",
+     "lead_male_singer",
+     "distorted_electric_guitar",
+     "clean_electric_guitar",
+     "toms",
+     "acoustic_guitar",
+     "background_vocals",
+     "hi_hat",
+     "overheads",
+     "atonal_percussion",
+     "grand_piano",
+     "cymbals",
+     "lead_female_singer",
+     "synth_lead",
+     "bass_synthesizer",
+     "synth_pad",
+     "organ_electric_organ",
+     "fx",
+     "drum_machine",
+     "string_section",
+     "electric_piano",
+     "full_acoustic_drumkit",
+     "other_sounds",
+     "pitched_percussion",
+     "brass",
+     "reeds",
+     "contrabass_double_bass",
+     "other_plucked",
+     "other_strings",
+     "other_wind",
+     "cello",
+     "other",
+     "flutes",
+     "viola_section",
+     "viola",
+     "cello_section",
+ ]
+
+ FINE_LEVEL_INSTRUMENTS = {
+     "lead_male_singer",
+     "lead_female_singer",
+     "human_choir",
+     "background_vocals",
+     "other_vocals",
+     "bass_guitar",
+     "bass_synthesizer",
+     "contrabass_double_bass",
+     "tuba",
+     "bassoon",
+     "snare_drum",
+     "toms",
+     "kick_drum",
+     "cymbals",
+     "overheads",
+     "full_acoustic_drumkit",
+     "drum_machine",
+     "hihat",
+     "fx",
+     "click_track",
+     "clean_electric_guitar",
+     "distorted_electric_guitar",
+     "lap_steel_guitar_or_slide_guitar",
+     "acoustic_guitar",
+     "other_plucked",
+     "atonal_percussion",
+     "pitched_percussion",
+     "grand_piano",
+     "electric_piano",
+     "organ_electric_organ",
+     "synth_pad",
+     "synth_lead",
+     "other_sounds",
+     "violin",
+     "viola",
+     "cello",
+     "violin_section",
+     "viola_section",
+     "cello_section",
+     "string_section",
+     "other_strings",
+     "brass",
+     "flutes",
+     "reeds",
+     "other_wind",
+ }
+
+ COARSE_LEVEL_INSTRUMENTS = {
+     "vocals",
+     "bass",
+     "drums",
+     "guitar",
+     "other_plucked",
+     "percussion",
+     "piano",
+     "other_keys",
+     "bowed_strings",
+     "wind",
+     "other",
+ }
+
+ COARSE_TO_FINE = {
+     "vocals": [
+         "lead_male_singer",
+         "lead_female_singer",
+         "human_choir",
+         "background_vocals",
+         "other_vocals",
+     ],
+     "bass": [
+         "bass_guitar",
+         "bass_synthesizer",
+         "contrabass_double_bass",
+         "tuba",
+         "bassoon",
+     ],
+     "drums": [
+         "snare_drum",
+         "toms",
+         "kick_drum",
+         "cymbals",
+         "overheads",
+         "full_acoustic_drumkit",
+         "drum_machine",
+         "hihat",
+     ],
+     "other": ["fx", "click_track"],
+     "guitar": [
+         "clean_electric_guitar",
+         "distorted_electric_guitar",
+         "lap_steel_guitar_or_slide_guitar",
+         "acoustic_guitar",
+     ],
+     "other_plucked": ["other_plucked"],
+     "percussion": ["atonal_percussion", "pitched_percussion"],
+     "piano": ["grand_piano", "electric_piano"],
+     "other_keys": ["organ_electric_organ", "synth_pad", "synth_lead", "other_sounds"],
+     "bowed_strings": [
+         "violin",
+         "viola",
+         "cello",
+         "violin_section",
+         "viola_section",
+         "cello_section",
+         "string_section",
+         "other_strings",
+     ],
+     "wind": ["brass", "flutes", "reeds", "other_wind"],
+ }
+
+ COARSE_TO_FINE = {k: set(v) for k, v in COARSE_TO_FINE.items()}
+ FINE_TO_COARSE = {k: kk for kk, v in COARSE_TO_FINE.items() for k in v}
+
+ ALL_LEVEL_INSTRUMENTS = COARSE_LEVEL_INSTRUMENTS.union(FINE_LEVEL_INSTRUMENTS)
+
+
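A quick check of the two-level label hierarchy defined above; the import pulls in the module's torch dependencies, so this assumes the repo's requirements are installed.

from core.data.moisesdb.dataset import COARSE_TO_FINE, FINE_TO_COARSE

assert FINE_TO_COARSE["kick_drum"] == "drums"
assert "bass_guitar" in COARSE_TO_FINE["bass"]
# every fine label maps to exactly one coarse label
assert set(FINE_TO_COARSE) == set().union(*COARSE_TO_FINE.values())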
186
+ class MoisesDBBaseDataset(BaseSourceSeparationDataset, ABC):
187
+ def __init__(
188
+ self,
189
+ split: str,
190
+ data_path: str = "/home/kwatchar3/Documents/data/moisesdb",
191
+ fs: int = 44100,
192
+ return_stems: Union[bool, List[str]] = False,
193
+ npy_memmap=True,
194
+ recompute_mixture=False,
195
+ train_folds=None,
196
+ val_folds=None,
197
+ test_folds=None,
198
+ query_file="query",
199
+ ) -> None:
200
+ if test_folds is None:
201
+ test_folds = [5]
202
+
203
+ if val_folds is None:
204
+ val_folds = [4]
205
+
206
+ if train_folds is None:
207
+ train_folds = [1, 2, 3]
208
+
209
+ split_path = os.path.join(data_path, "splits.csv")
210
+ splits = pd.read_csv(split_path)
211
+
212
+ metadata_path = os.path.join(data_path, "stems.csv")
213
+ metadata = pd.read_csv(metadata_path)
214
+
215
+ if split == "train":
216
+ folds = train_folds
217
+ elif split == "val":
218
+ folds = val_folds
219
+ elif split == "test":
220
+ folds = test_folds
221
+ else:
222
+ raise NameError
223
+
224
+ files = splits[splits["split"].isin(folds)]["song_id"].tolist()
225
+ metadata = metadata[metadata["song_id"].isin(files)]
226
+
227
+ super().__init__(
228
+ split=split,
229
+ stems=["mixture"],
230
+ files=files,
231
+ data_path=data_path,
232
+ fs=fs,
233
+ npy_memmap=npy_memmap,
234
+ recompute_mixture=recompute_mixture,
235
+ )
236
+
237
+ self.folds = folds
238
+
239
+ self.metadata = metadata.rename(
240
+ columns={k: k.replace(" ", "_") for k in metadata.columns}
241
+ )
242
+
243
+ self.song_to_stem = (
244
+ metadata.set_index("song_id")
245
+ .apply(lambda row: row[row == 1].index.tolist(), axis=1)
246
+ .to_dict()
247
+ )
248
+ self.stem_to_song = (
249
+ metadata.set_index("song_id")
250
+ .transpose()
251
+ .apply(lambda row: row[row == 1].index.tolist(), axis=1)
252
+ .to_dict()
253
+ )
254
+
255
+ self.true_length = len(self.files)
256
+ self.n_channels = 2
257
+
258
+ self.audio_path = os.path.join(data_path, "npy2")
259
+
260
+ self.return_stems = return_stems
261
+
262
+ self.query_file = query_file
263
+
264
+ def get_full_stem(self, *, stem: str, identifier) -> torch.Tensor:
265
+ song_id = identifier["song_id"]
266
+ path = os.path.join(self.data_path, "npy2", song_id)
267
+ # noinspection PyUnresolvedReferences
268
+
269
+ assert self.npy_memmap
270
+
271
+ if os.path.exists(os.path.join(path, f"{stem}.npy")):
272
+ audio = np.load(os.path.join(path, f"{stem}.npy"), mmap_mode="r")
273
+ else:
274
+ audio = None
275
+
276
+ return audio
277
+
278
+ def get_query_stem(self, *, stem: str, identifier) -> torch.Tensor:
279
+ song_id = identifier["song_id"]
280
+ path = os.path.join(self.data_path, "npyq", song_id)
281
+ # noinspection PyUnresolvedReferences
282
+
283
+ if self.npy_memmap:
284
+ # print(self.npy_memmap)
285
+ audio = np.load(
286
+ os.path.join(path, f"{stem}.{self.query_file}.npy"), mmap_mode="r"
287
+ )
288
+ else:
289
+ raise NotImplementedError
290
+
291
+ return audio
292
+
293
+ def get_stem(self, *, stem: str, identifier) -> torch.Tensor:
294
+ audio = self.get_full_stem(stem=stem, identifier=identifier)
295
+ return audio
296
+
297
+ def get_identifier(self, index):
298
+ return dict(song_id=self.files[index % self.true_length])
299
+
300
+ def __getitem__(self, index: int):
301
+ identifier = self.get_identifier(index)
302
+ audio = self.get_audio(identifier)
303
+
304
+ mixture = audio["mixture"].copy()
305
+
306
+ if isinstance(self.return_stems, list):
307
+ sources = {
308
+ stem: audio.get(stem, np.zeros_like(mixture))
309
+ for stem in self.return_stems
310
+ }
311
+ elif isinstance(self.return_stems, bool):
312
+ if self.return_stems:
313
+ sources = {
314
+ stem: audio[stem].copy()
315
+ for stem in self.song_to_stem[identifier["song_id"]]
316
+ }
317
+ else:
318
+ sources = None
319
+ else:
320
+ raise ValueError
321
+
322
+ return input_dict(
323
+ mixture=mixture,
324
+ sources=sources,
325
+ metadata=identifier,
326
+ modality="audio",
327
+ )
328
+
329
+
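The two pandas one-liners above build a forward index (song to stems present) and an inverse index (stem to songs) from the 0/1 stem-presence matrix in stems.csv. A toy illustration with hypothetical column names:

import pandas as pd

metadata = pd.DataFrame({"song_id": ["a", "b"], "vocals": [1, 0], "bass": [1, 1]})

song_to_stem = (
    metadata.set_index("song_id")
    .apply(lambda row: row[row == 1].index.tolist(), axis=1)
    .to_dict()
)
stem_to_song = (
    metadata.set_index("song_id")
    .transpose()
    .apply(lambda row: row[row == 1].index.tolist(), axis=1)
    .to_dict()
)

assert song_to_stem == {"a": ["vocals", "bass"], "b": ["bass"]}
assert stem_to_song == {"vocals": ["a"], "bass": ["a", "b"]}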
+ class MoisesDBFullTrackDataset(MoisesDBBaseDataset):
+     def __init__(
+         self,
+         data_root: str,
+         split: str,
+         return_stems: Union[bool, List[str]] = False,
+         npy_memmap=True,
+         recompute_mixture=False,
+         query_file="query",
+     ) -> None:
+         super().__init__(
+             split=split,
+             data_path=data_root,
+             return_stems=return_stems,
+             npy_memmap=npy_memmap,
+             recompute_mixture=recompute_mixture,
+             query_file=query_file,
+         )
+
+     def __len__(self) -> int:
+         return self.true_length
+
+
+ class MoisesDBVDBOFullTrackDataset(MoisesDBFullTrackDataset):
+     def __init__(
+         self, data_root: str, split: str, npy_memmap=True, recompute_mixture=False
+     ) -> None:
+         super().__init__(
+             data_root=data_root,
+             split=split,
+             return_stems=["vocals", "bass", "drums", "vdbo_others"],
+             npy_memmap=npy_memmap,
+             recompute_mixture=recompute_mixture,
+             query_file=None,
+         )
+
+
+ import torch_audiomentations as audiomentations
+ from torch_audiomentations.utils.dsp import convert_decibels_to_amplitude_ratio
+
+
+ class SmartGain(audiomentations.Gain):
+     def __init__(
+         self, p=0.5, min_gain_in_db=-6, max_gain_in_db=6, dbfs_threshold=-45.0
+     ):
+         super().__init__(
+             p=p, min_gain_in_db=min_gain_in_db, max_gain_in_db=max_gain_in_db
+         )
+
+         self.dbfs_threshold = dbfs_threshold
+
+     def randomize_parameters(
+         self,
+         samples: Tensor = None,
+         sample_rate: Optional[int] = None,
+         targets: Optional[Tensor] = None,
+         target_rate: Optional[int] = None,
+     ):
+
+         dbfs = 10 * torch.log10(torch.mean(torch.square(samples)) + 1e-6)
+
+         if dbfs > self.dbfs_threshold:
+             low = self.min_gain_in_db
+         else:
+             low = max(0.0, self.min_gain_in_db)
+
+         distribution = torch.distributions.Uniform(
+             low=torch.tensor(low, dtype=torch.float32, device=samples.device),
+             high=torch.tensor(
+                 self.max_gain_in_db, dtype=torch.float32, device=samples.device
+             ),
+             validate_args=True,
+         )
+         selected_batch_size = samples.size(0)
+         self.transform_parameters["gain_factors"] = (
+             convert_decibels_to_amplitude_ratio(
+                 distribution.sample(sample_shape=(selected_batch_size,))
+             )
+             .unsqueeze(1)
+             .unsqueeze(1)
+         )
+
+
+ class Audiomentations(audiomentations.Compose):
+     def __init__(self, augment="gssp", fs: int = 44100):
+
+         if isinstance(augment, str):
+             if augment == "gssp":
+                 augment = OmegaConf.create(
+                     [
+                         dict(
+                             cls="Shift",
+                             kwargs=dict(p=1.0, min_shift=-0.5, max_shift=0.5),
+                         ),
+                         dict(
+                             cls="Gain",
+                             kwargs=dict(p=1.0, min_gain_in_db=-6, max_gain_in_db=6),
+                         ),
+                         dict(cls="ShuffleChannels", kwargs=dict(p=0.5)),
+                         dict(cls="PolarityInversion", kwargs=dict(p=0.5)),
+                     ]
+                 )
+             else:
+                 raise ValueError
+
+         transforms = []
+
+         for transform in augment:
+
+             if transform.cls == "Gain":
+                 transforms.append(SmartGain(**transform.kwargs))
+             else:
+                 transforms.append(
+                     getattr(audiomentations, transform.cls)(**transform.kwargs)
+                 )
+
+         super().__init__(transforms=transforms, shuffle=True)
+
+         self.fs = fs
+
+     def forward(
+         self,
+         samples: torch.Tensor = None,
+     ) -> ObjectDict:
+         return super().forward(samples, sample_rate=self.fs)
+
+
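The gating in SmartGain.randomize_parameters boils down to the arithmetic below: chunks quieter than dbfs_threshold only ever draw non-negative gains, so near-silence is never attenuated further. A standalone trace of that rule (values are illustrative):

import torch

samples = 0.001 * torch.randn(1, 2, 44100)  # a very quiet chunk
dbfs = 10 * torch.log10(torch.mean(torch.square(samples)) + 1e-6)
low = -6.0 if dbfs > -45.0 else max(0.0, -6.0)  # quiet chunk -> low = 0.0
print(f"{dbfs.item():.1f} dBFS -> gain drawn from [{low}, 6.0] dB")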
+ class MoisesDBVDBORandomChunkDataset(MoisesDBVDBOFullTrackDataset):
+     def __init__(
+         self,
+         data_root: str,
+         split: str,
+         chunk_size_seconds: float = 4.0,
+         fs: int = 44100,
+         target_length: int = 8192,
+         augment=None,
+         npy_memmap=True,
+         recompute_mixture=True,
+         db_threshold=-24.0,
+         db_step=-12.0,
+     ) -> None:
+         super().__init__(
+             data_root=data_root,
+             split=split,
+             npy_memmap=npy_memmap,
+             recompute_mixture=recompute_mixture,
+         )
+
+         self.chunk_size_seconds = chunk_size_seconds
+         self.chunk_size_samples = int(chunk_size_seconds * fs)
+         self.fs = fs
+
+         self.target_length = target_length
+
+         self.db_threshold = db_threshold
+         self.db_step = db_step
+
+         if augment is not None:
+             assert self.recompute_mixture
+             self.augment = Audiomentations(augment, fs)
+         else:
+             self.augment = None
+
+     def __len__(self) -> int:
+         return self.target_length
+
+     def _chunk_audio(self, audio, start, end):
+         audio = {k: v[..., start:end] for k, v in audio.items()}
+
+         return audio
+
+     def _get_start_end(self, audio, identifier):
+         n_samples = audio.shape[-1]
+         start = np.random.randint(0, n_samples - self.chunk_size_samples)
+         end = start + self.chunk_size_samples
+
+         return start, end
+
+     def _get_audio(self, stems, identifier: Dict[str, Any]):
+         audio = {}
+
+         for stem in stems:
+             audio[stem] = self.get_full_stem(stem=stem, identifier=identifier)
+
+         for stem in stems:
+             if audio[stem] is None:
+                 audio[stem] = np.zeros(
+                     audio[
+                         (
+                             "mixture"
+                             if "mixture" in stems
+                             else [s for s in stems if audio[s] is not None][0]
+                         )
+                     ].shape,
+                     dtype=np.float32,
+                 )
+
+         start, end = self._get_start_end(audio[stems[0]], identifier)
+         audio = self._chunk_audio(audio, start, end)
+
+         if self.augment is not None:
+             audio = {
+                 k: self.augment(torch.from_numpy(v[None, :, :]))[0, :, :].numpy()
+                 for k, v in audio.items()
+             }
+
+         return audio
+
+     def get_audio(self, identifier: Dict[str, Any]):
+         if self.recompute_mixture:
+             audio = self._get_audio(
+                 ["vocals", "bass", "drums", "vdbo_others"], identifier=identifier
+             )
+             audio["mixture"] = self.compute_mixture(audio)
+             return audio
+         else:
+             return self._get_audio(
+                 ["mixture", "vocals", "bass", "drums", "vdbo_others"],
+                 identifier=identifier,
+             )
+
+     def __getitem__(self, index: int):
+
+         identifier = self.get_identifier(index)
+         audio = self.get_audio(identifier=identifier)
+
+         mixture = audio["mixture"].copy()
+
+         sources = {
+             stem: audio.get(stem, np.zeros_like(mixture)) for stem in self.return_stems
+         }
+
+         return input_dict(
+             mixture=mixture,
+             sources=sources,
+             metadata=identifier,
+             modality="audio",
+         )
+
+
+ class MoisesDBVDBODeterministicChunkDataset(MoisesDBVDBORandomChunkDataset):
+     def __init__(
+         self,
+         data_root: str,
+         split: str,
+         chunk_size_seconds: float = 4.0,
+         hop_size_seconds: float = 8.0,
+         fs: int = 44100,
+         npy_memmap=True,
+         recompute_mixture=False,
+     ) -> None:
+         super().__init__(
+             data_root=data_root,
+             split=split,
+             chunk_size_seconds=chunk_size_seconds,
+             npy_memmap=npy_memmap,
+             recompute_mixture=recompute_mixture,
+         )
+
+         self.hop_size_seconds = hop_size_seconds
+         self.hop_size_samples = int(hop_size_seconds * fs)
+
+         self.index_to_identifiers = self._generate_index()
+         self.length = len(self.index_to_identifiers)
+
+     def __len__(self) -> int:
+         return self.length
+
+     def _generate_index(self):
+
+         identifiers = []
+
+         for song_id in self.files:
+             audio = self.get_full_stem(stem="mixture", identifier=dict(song_id=song_id))
+             n_samples = audio.shape[-1]
+             n_chunks = math.floor(
+                 (n_samples - self.chunk_size_samples) / self.hop_size_samples
+             )
+
+             for i in range(n_chunks):
+                 chunk_start = i * self.hop_size_samples
+                 identifiers.append(dict(song_id=song_id, chunk_start=chunk_start))
+
+         return identifiers
+
+     def get_identifier(self, index):
+         return self.index_to_identifiers[index]
+
+     def _get_start_end(self, audio, identifier):
+
+         start = identifier["chunk_start"]
+         end = start + self.chunk_size_samples
+
+         return start, end
+
+
+ def round_samples(seconds, fs, hop_size, downsample):
+     n_frames = math.ceil(seconds * fs / hop_size) + 1
+     n_frames_down = math.ceil(n_frames / downsample)
+     n_frames = n_frames_down * downsample
+     n_samples = (n_frames - 1) * hop_size
+
+     return int(n_samples)
+
+
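round_samples snaps a duration up to a sample count whose STFT frame count is a multiple of the downsampling factor. For the values used below (4 s at 44.1 kHz, hop 512, downsample 2**6), the arithmetic works out as follows:

import math

seconds, fs, hop, down = 4.0, 44100, 512, 2**6
n_frames = math.ceil(seconds * fs / hop) + 1   # 346 STFT frames
n_frames = math.ceil(n_frames / down) * down   # padded up to 384 = 6 * 64
n_samples = (n_frames - 1) * hop               # 196096 samples (~4.45 s)
assert n_samples == 196096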
636
+ def __init__(
637
+ self,
638
+ data_root: str,
639
+ split: str,
640
+ target_length: int,
641
+ chunk_size_seconds: float = 4.0,
642
+ query_size_seconds: float = 1.0,
643
+ round_query: bool = False,
644
+ min_query_dbfs: float = -40.0,
645
+ min_target_dbfs: float = -36.0,
646
+ min_target_dbfs_step: float = -12.0,
647
+ max_dbfs_tries: int = 10,
648
+ top_k_instrument: int = 10,
649
+ mixture_stem: str = "mixture",
650
+ use_own_query: bool = True,
651
+ npy_memmap=True,
652
+ allowed_stems=None,
653
+ query_file="query",
654
+ augment=None,
655
+ ) -> None:
656
+
657
+ super().__init__(
658
+ data_root=data_root,
659
+ split=split,
660
+ npy_memmap=npy_memmap,
661
+ recompute_mixture=augment is not None,
662
+ query_file=query_file,
663
+ )
664
+
665
+ self.mixture_stem = mixture_stem
666
+
667
+ self.chunk_size_seconds = chunk_size_seconds
668
+ self.chunk_size_samples = round_samples(
669
+ self.chunk_size_seconds, self.fs, 512, 2**6
670
+ )
671
+
672
+ self.query_size_seconds = query_size_seconds
673
+
674
+ if round_query:
675
+ self.query_size_samples = round_samples(
676
+ self.query_size_seconds, self.fs, 512, 2**6
677
+ )
678
+ else:
679
+ self.query_size_samples = int(self.query_size_seconds * self.fs)
680
+
681
+ self.target_length = target_length
682
+
683
+ self.min_query_dbfs = min_query_dbfs
684
+
685
+ if min_target_dbfs is None:
686
+ min_target_dbfs = -np.inf
687
+ min_target_dbfs_step = None
688
+ max_dbfs_tries = 1
689
+
690
+ self.min_target_dbfs = min_target_dbfs
691
+ self.min_target_dbfs_step = min_target_dbfs_step
692
+ self.max_dbfs_tries = max_dbfs_tries
693
+
694
+ self.top_k_instrument = top_k_instrument
695
+
696
+ if allowed_stems is None:
697
+ allowed_stems = INST_BY_OCCURRENCE[: self.top_k_instrument]
698
+ else:
699
+ self.top_k_instrument = None
700
+
701
+ self.allowed_stems = allowed_stems
702
+
703
+ self.song_to_all_stems = {
704
+ k: list(set(v) & set(ALL_LEVEL_INSTRUMENTS))
705
+ for k, v in self.song_to_stem.items()
706
+ }
707
+
708
+ self.song_to_stem = {
709
+ k: list(set(v) & set(self.allowed_stems))
710
+ for k, v in self.song_to_stem.items()
711
+ }
712
+ self.stem_to_song = {
713
+ k: list(set(v) & set(self.files)) for k, v in self.stem_to_song.items()
714
+ }
715
+
716
+ self.queriable_songs = [k for k, v in self.song_to_stem.items() if len(v) > 0]
717
+
718
+ self.use_own_query = use_own_query
719
+
720
+ if self.use_own_query:
721
+ self.files = [k for k in self.files if len(self.song_to_stem[k]) > 0]
722
+ self.true_length = len(self.files)
723
+
724
+ if augment is not None:
725
+ assert self.recompute_mixture
726
+ self.augment = Audiomentations(augment, self.fs)
727
+ else:
728
+ self.augment = None
729
+
730
+ def __len__(self) -> int:
731
+ return self.target_length
732
+
733
+ def _chunk_audio(self, audio, start, end):
734
+ audio = {k: v[..., start:end] for k, v in audio.items()}
735
+
736
+ return audio
737
+
738
+ def _get_start_end(self, audio):
739
+ n_samples = audio.shape[-1]
740
+ start = np.random.randint(0, n_samples - self.chunk_size_samples)
741
+ end = start + self.chunk_size_samples
742
+
743
+ return start, end
744
+
745
+ def _target_dbfs(self, audio):
746
+ return 10.0 * np.log10(np.mean(np.square(np.abs(audio))) + 1e-6)
747
+
748
+ def _chunk_and_check_dbfs_threshold(self, audio_, target_stem, threshold):
749
+
750
+ target_dict = {target_stem: audio_[target_stem]}
751
+
752
+ for _ in range(self.max_dbfs_tries):
753
+ start, end = self._get_start_end(audio_[target_stem])
754
+ taudio = self._chunk_audio(target_dict, start, end)
755
+
756
+ dbfs = self._target_dbfs(taudio[target_stem])
757
+ if dbfs > threshold:
758
+ return self._chunk_audio(audio_, start, end)
759
+
760
+ return None
761
+
762
+ def _chunk_and_check_dbfs(self, audio_, target_stem):
763
+ out = self._chunk_and_check_dbfs_threshold(
764
+ audio_, target_stem, self.min_target_dbfs
765
+ )
766
+
767
+ if out is not None:
768
+ return out
769
+
770
+ out = self._chunk_and_check_dbfs_threshold(
771
+ audio_, target_stem, self.min_target_dbfs + self.min_target_dbfs_step
772
+ )
773
+
774
+ if out is not None:
775
+ return out
776
+
777
+ start, end = self._get_start_end(audio_[target_stem])
778
+ audio = self._chunk_audio(audio_, start, end)
779
+
780
+ return audio
781
+
782
+ def _augment(self, audio, target_stem):
783
+ stack_audio = np.stack([v for v in audio.values()], axis=0)
784
+ aug_audio = self.augment(torch.from_numpy(stack_audio)).numpy()
785
+ mixture = np.sum(aug_audio, axis=0)
786
+
787
+ out = {
788
+ "mixture": mixture,
789
+ }
790
+
791
+ if target_stem is not None:
792
+ target_idx = list(audio.keys()).index(target_stem)
793
+ out[target_stem] = aug_audio[target_idx]
794
+
795
+ return out
796
+
797
+ def _choose_stems_for_augment(self, identifier, target_stem):
798
+ stems_for_song = set(self.song_to_all_stems[identifier["song_id"]])
799
+
800
+ stems_ = []
801
+ coarse_level_accounted = set()
802
+
803
+ is_none_target = target_stem is None
804
+ is_coarse_target = target_stem in COARSE_LEVEL_INSTRUMENTS
805
+
806
+ if is_coarse_target or is_none_target:
807
+ coarse_target = target_stem
808
+ else:
809
+ coarse_target = FINE_TO_COARSE[target_stem]
810
+
811
+ fine_level_stems = stems_for_song & FINE_LEVEL_INSTRUMENTS
812
+ coarse_level_stems = stems_for_song & COARSE_LEVEL_INSTRUMENTS
813
+
814
+ for s in fine_level_stems:
815
+ coarse_level = FINE_TO_COARSE[s]
816
+
817
+ if is_coarse_target and coarse_level == coarse_target:
818
+ continue
819
+ else:
820
+ stems_.append(s)
821
+
822
+ coarse_level_accounted.add(coarse_level)
823
+
824
+ stems_ += list(coarse_level_stems - coarse_level_accounted)
825
+
826
+ if target_stem is not None:
827
+ assert target_stem in stems_, f"stems: {stems_}, target stem: {target_stem}"
828
+
829
+ if len(stems_for_song) > 1:
830
+ assert (
831
+ len(stems_) > 1
832
+ ), f"stems: {stems_}, stems in song: {stems_for_song},\n target stem: {target_stem}"
833
+
834
+ assert "mixture" not in stems_
835
+
836
+ return stems_
837
+
838
+ def _get_audio(
839
+ self, stems, identifier: Dict[str, Any], check_dbfs=True, no_target=False
840
+ ):
841
+
842
+ target_stem = stems[0] if not no_target else None
843
+
844
+ if self.augment is not None:
845
+ stems_ = self._choose_stems_for_augment(identifier, target_stem)
846
+ else:
847
+ stems_ = stems
848
+
849
+ audio = {}
850
+ for stem in stems_:
851
+ audio[stem] = self.get_full_stem(stem=stem, identifier=identifier)
852
+
853
+ audio_ = {k: v.copy() for k, v in audio.items()}
854
+
855
+ if check_dbfs:
856
+ assert target_stem is not None
857
+ audio = self._chunk_and_check_dbfs(audio_, target_stem)
858
+ else:
859
+ first_key = list(audio_.keys())[0]
860
+ start, end = self._get_start_end(audio_[first_key])
861
+ audio = self._chunk_audio(audio_, start, end)
862
+
863
+ if self.augment is not None:
864
+ assert "mixture" not in audio
865
+ audio = self._augment(audio, target_stem)
866
+ assert "mixture" in audio
867
+
868
+ return audio
869
+
870
+ def __getitem__(self, index: int):
871
+
872
+ mix_identifier = self.get_identifier(index)
873
+ mix_stems = self.song_to_stem[mix_identifier["song_id"]]
874
+
875
+ if self.use_own_query:
876
+ query_id = mix_identifier["song_id"]
877
+ query_identifier = dict(song_id=query_id)
878
+ possible_stem = mix_stems
879
+
880
+ assert len(possible_stem) > 0
881
+
882
+ zero_target = False
883
+ else:
884
+ query_id = random.choice(self.queriable_songs)
885
+ query_identifier = dict(song_id=query_id)
886
+ query_stems = self.song_to_stem[query_id]
887
+ possible_stem = list(set(mix_stems) & set(query_stems))
888
+
889
+ if len(possible_stem) == 0:
890
+ possible_stem = query_stems
891
+ zero_target = True
892
+ # print(f"Mix {mix_identifier['song_id']} and query {query_id} have no common stems.")
893
+ # return self.__getitem__(index + 1)
894
+ else:
895
+ zero_target = False
896
+
897
+ assert (
898
+ len(possible_stem) > 0
899
+ ), f"{mix_identifier['song_id']} and {query_id} have no common stems. zero target is {zero_target}"
900
+ stem = random.choice(possible_stem)
901
+
902
+ if zero_target:
903
+ audio = self._get_audio(
904
+ [self.mixture_stem],
905
+ identifier=mix_identifier,
906
+ check_dbfs=False,
907
+ no_target=True,
908
+ )
909
+ mixture = audio[self.mixture_stem].copy()
910
+ sources = {"target": np.zeros_like(mixture)}
911
+ else:
912
+ audio = self._get_audio(
913
+ [stem, self.mixture_stem], identifier=mix_identifier, check_dbfs=True
914
+ )
915
+ mixture = audio[self.mixture_stem].copy()
916
+ sources = {"target": audio[stem].copy()}
917
+
918
+ query = self.get_query_stem(stem=stem, identifier=query_identifier)
919
+ query = query.copy()
920
+
921
+ assert mixture.shape[-1] == self.chunk_size_samples
922
+ assert query.shape[-1] == self.query_size_samples
923
+ assert sources["target"].shape[-1] == self.chunk_size_samples
924
+
925
+ return input_dict(
926
+ mixture=mixture,
927
+ sources=sources,
928
+ query=query,
929
+ metadata={
930
+ "mix": mix_identifier,
931
+ "query": query_identifier,
932
+ "stem": stem,
933
+ },
934
+ modality="audio",
935
+ )
936
+
937
+
+ class MoisesDBRandomChunkBalancedRandomQueryDataset(
+     MoisesDBRandomChunkRandomQueryDataset
+ ):
+     def __init__(
+         self,
+         data_root: str,
+         split: str,
+         target_length: int,
+         chunk_size_seconds: float = 4,
+         query_size_seconds: float = 1,
+         round_query: bool = False,
+         min_query_dbfs: float = -40.0,
+         min_target_dbfs: float = -36.0,
+         min_target_dbfs_step: float = -12.0,
+         max_dbfs_tries: int = 10,
+         top_k_instrument: int = 10,
+         mixture_stem: str = "mixture",
+         use_own_query: bool = True,
+         npy_memmap=True,
+         allowed_stems=None,
+         query_file="query",
+         augment=None,
+     ) -> None:
+         super().__init__(
+             data_root,
+             split,
+             target_length,
+             chunk_size_seconds,
+             query_size_seconds,
+             round_query,
+             min_query_dbfs,
+             min_target_dbfs,
+             min_target_dbfs_step,
+             max_dbfs_tries,
+             top_k_instrument,
+             mixture_stem,
+             use_own_query,
+             npy_memmap,
+             allowed_stems,
+             query_file,
+             augment,
+         )
+
+         self.stem_to_n_songs = {k: len(v) for k, v in self.stem_to_song.items()}
+         self.trainable_stems = [k for k, v in self.stem_to_n_songs.items() if v > 1]
+         self.n_allowed_stems = len(self.allowed_stems)
+
+     def __getitem__(self, index: int):
+
+         stem = self.allowed_stems[index % self.n_allowed_stems]
+         song_ids_with_stem = self.stem_to_song[stem]
+
+         song_id = song_ids_with_stem[index % self.stem_to_n_songs[stem]]
+
+         mix_identifier = dict(song_id=song_id)
+
+         audio = self._get_audio([stem, self.mixture_stem], identifier=mix_identifier, check_dbfs=True)
+         mixture = audio[self.mixture_stem].copy()
+
+         if self.use_own_query:
+             query_id = song_id
+             query_identifier = dict(song_id=query_id)
+         else:
+             query_id = random.choice(song_ids_with_stem)
+             query_identifier = dict(song_id=query_id)
+
+         query = self.get_query_stem(stem=stem, identifier=query_identifier)
+         query = query.copy()
+
+         sources = {"target": audio[stem].copy()}
+
+         return input_dict(
+             mixture=mixture,
+             sources=sources,
+             query=query,
+             metadata={
+                 "mix": mix_identifier,
+                 "query": query_identifier,
+                 "stem": stem,
+             },
+             modality="audio",
+         )
+
+
+ class MoisesDBDeterministicChunkDeterministicQueryDataset(
+     MoisesDBRandomChunkRandomQueryDataset
+ ):
+     def __init__(
+         self,
+         data_root: str,
+         split: str,
+         chunk_size_seconds: float = 4.0,
+         hop_size_seconds: float = 8.0,
+         query_size_seconds: float = 1.0,
+         min_query_dbfs: float = -40.0,
+         top_k_instrument: int = 10,
+         n_queries_per_chunk: int = 1,
+         mixture_stem: str = "mixture",
+         use_own_query: bool = True,
+         npy_memmap=True,
+         allowed_stems: List[str] = None,
+         query_file="query",
+     ) -> None:
+
+         super().__init__(
+             data_root=data_root,
+             split=split,
+             target_length=None,
+             chunk_size_seconds=chunk_size_seconds,
+             query_size_seconds=query_size_seconds,
+             min_query_dbfs=min_query_dbfs,
+             top_k_instrument=top_k_instrument,
+             mixture_stem=mixture_stem,
+             use_own_query=use_own_query,
+             npy_memmap=npy_memmap,
+             allowed_stems=allowed_stems,
+             query_file=query_file,
+         )
+
+         if hop_size_seconds is None:
+             hop_size_seconds = chunk_size_seconds
+
+         self.chunk_hop_size_seconds = hop_size_seconds
+
+         self.chunk_hop_size_samples = int(hop_size_seconds * self.fs)
+
+         self.n_queries_per_chunk = n_queries_per_chunk
+
+         self._overwrite = False
+
+         self.query_tuples = self.find_query_tuples_or_generate()
+         self.n_chunks = len(self.query_tuples)
+
+     def __len__(self) -> int:
+         return self.n_chunks
+
+     def _get_audio(self, stems, identifier: Dict[str, Any]):
+         audio = {}
+
+         for stem in stems:
+             audio[stem] = self.get_full_stem(stem=stem, identifier=identifier)
+
+         start = identifier["chunk_start"]
+         end = start + self.chunk_size_samples
+         audio = self._chunk_audio(audio, start, end)
+
+         return audio
+
+     def find_query_tuples_or_generate(self):
+         query_path = os.path.join(self.data_path, "queries")
+         val_folds = "-".join(map(str, self.folds))
+
+         path_so_far = os.path.join(query_path, val_folds)
+
+         if not os.path.exists(path_so_far):
+             return self.generate_index()
+
+         chunk_specs = f"chunk{self.chunk_size_samples}-hop{self.chunk_hop_size_samples}"
+         path_so_far = os.path.join(path_so_far, chunk_specs)
+
+         if not os.path.exists(path_so_far):
+             return self.generate_index()
+
+         query_specs = f"query{self.query_size_samples}-n{self.n_queries_per_chunk}"
+         path_so_far = os.path.join(path_so_far, query_specs)
+
+         if not os.path.exists(path_so_far):
+             return self.generate_index()
+
+         if self.top_k_instrument is not None:
+             path_so_far = os.path.join(
+                 path_so_far, f"queries-top{self.top_k_instrument}.csv"
+             )
+         else:
+             if len(self.allowed_stems) > 5:
+                 allowed_stems = (
+                     str(len(self.allowed_stems))
+                     + "stems:"
+                     + ":".join([k[0] for k in self.allowed_stems if k != "mixture"])
+                 )
+             else:
+                 allowed_stems = ":".join(self.allowed_stems)
+
+             path_so_far = os.path.join(path_so_far, f"queries-{allowed_stems}.csv")
+
+         if not os.path.exists(path_so_far):
+             return self.generate_index()
+
+         print(f"Loading query tuples from {path_so_far}")
+
+         return pd.read_csv(path_so_far)
+
+     def _get_index_path(self):
+         query_root = os.path.join(self.data_path, "queries")
+         val_folds = "-".join(map(str, self.folds))
+         chunk_specs = f"chunk{self.chunk_size_samples}-hop{self.chunk_hop_size_samples}"
+         query_specs = f"query{self.query_size_samples}-n{self.n_queries_per_chunk}"
+         query_dir = os.path.join(query_root, val_folds, chunk_specs, query_specs)
+
+         if self.top_k_instrument is not None:
+             query_path = os.path.join(
+                 query_dir, f"queries-top{self.top_k_instrument}.csv"
+             )
+         else:
+             if len(self.allowed_stems) > 5:
+                 allowed_stems = (
+                     str(len(self.allowed_stems))
+                     + "stems:"
+                     + ":".join([k[0] for k in self.allowed_stems if k != "mixture"])
+                 )
+             else:
+                 allowed_stems = ":".join(self.allowed_stems)
+             query_path = os.path.join(query_dir, f"queries-{allowed_stems}.csv")
+
+         if not self._overwrite:
+             assert not os.path.exists(
+                 query_path
+             ), f"Query path {query_path} already exists."
+
+         os.makedirs(query_dir, exist_ok=True)
+
+         return query_path
+
+     def generate_index(self):
+
+         query_path = self._get_index_path()
+
+         durations = pd.read_csv(os.path.join(self.data_path, "durations.csv"))
+         durations = (
+             durations[["song_id", "duration"]]
+             .set_index("song_id")["duration"]
+             .to_dict()
+         )
+
+         tuples = []
+
+         stems_without_queries = defaultdict(list)
+
+         for i, song_id in tqdm(enumerate(self.files), total=len(self.files)):
+             song_duration = durations[song_id]
+             mix_stems = self.song_to_stem[song_id]
+
+             n_mix_chunks = math.floor(
+                 (song_duration - self.chunk_size_seconds) / self.chunk_hop_size_seconds
+             )
+
+             for stem in mix_stems:
+                 # work on a copy so the shared stem_to_song index is not mutated
+                 possible_queries = list(self.stem_to_song[stem])
+                 if song_id in possible_queries:
+                     possible_queries.remove(song_id)
+
+                 if len(possible_queries) == 0:
+                     stems_without_queries[song_id].append(stem)
+                     continue
+
+                 for k in tqdm(range(n_mix_chunks), desc=f"song{i + 1}/{stem}"):
+                     mix_chunk_start = int(k * self.chunk_hop_size_samples)
+
+                     for j in range(self.n_queries_per_chunk):
+                         query = random.choice(possible_queries)
+
+                         tuples.append(
+                             dict(
+                                 mix=song_id,
+                                 query=query,
+                                 stem=stem,
+                                 mix_chunk_start=mix_chunk_start,
+                             )
+                         )
+
+         if len(stems_without_queries) > 0:
+             print("Stems without queries:")
+             for song_id, stems in stems_without_queries.items():
+                 print(f"{song_id}: {stems}")
+
+         tuples = pd.DataFrame(tuples)
+
+         print(
+             f"Generating query tuples for {self.split} set with {len(tuples)} tuples."
+         )
+         print(f"Saving query tuples to {query_path}")
+
+         tuples.to_csv(query_path, index=False)
+
+         return tuples
+
+     def index_to_identifiers(self, index: int) -> Tuple[str, str, str, int]:
+
+         row = self.query_tuples.iloc[index]
+         mix_id = row["mix"]
+
+         if self.use_own_query:
+             query_id = mix_id
+         else:
+             query_id = row["query"]
+
+         stem = row["stem"]
+         mix_chunk_start = row["mix_chunk_start"]
+
+         return mix_id, query_id, stem, mix_chunk_start
+
+     def __getitem__(self, index: int):
+
+         mix_id, query_id, stem, mix_chunk_start = self.index_to_identifiers(index)
+
+         mix_identifier = dict(song_id=mix_id, chunk_start=mix_chunk_start)
+         query_identifier = dict(song_id=query_id)
+
+         audio = self._get_audio([stem, self.mixture_stem], identifier=mix_identifier)
+         query = self.get_query_stem(stem=stem, identifier=query_identifier)
+
+         mixture = audio[self.mixture_stem].copy()
+         sources = {"target": audio[stem].copy()}
+         query = query.copy()
+
+         assert mixture.shape[-1] == self.chunk_size_samples
+         # print(query.shape[-1], self.query_size_samples)
+         assert query.shape[-1] == self.query_size_samples
+         assert sources["target"].shape[-1] == self.chunk_size_samples
+
+         return input_dict(
+             mixture=mixture,
+             sources=sources,
+             query=query,
+             metadata={
+                 "mix": mix_identifier,
+                 "query": query_identifier,
+                 "stem": stem,
+             },
+             modality="audio",
+         )
+
+
1275
+ class MoisesDBFullTrackTestQueryDataset(MoisesDBFullTrackDataset):
1276
+ def __init__(
1277
+ self,
1278
+ data_root: str,
1279
+ split: str = "test",
1280
+ top_k_instrument: int = 10,
1281
+ mixture_stem: str = "mixture",
1282
+ use_own_query: bool = True,
1283
+ npy_memmap=True,
1284
+ allowed_stems: List[str] = None,
1285
+ query_file="query-10s",
1286
+ ) -> None:
1287
+ super().__init__(
1288
+ data_root=data_root,
1289
+ split=split,
1290
+ npy_memmap=npy_memmap,
1291
+ recompute_mixture=False,
1292
+ query_file=query_file,
1293
+ )
1294
+
1295
+ self.use_own_query = use_own_query
1296
+
1297
+ self.allowed_stems = allowed_stems
1298
+
1299
+ test_indices = pd.read_csv(os.path.join(data_root, "test_indices.csv"))
1300
+
1301
+ test_indices = test_indices[test_indices.stem.isin(self.allowed_stems)]
1302
+
1303
+ self.test_indices = test_indices
1304
+
1305
+ self.length = len(self.test_indices)
1306
+
1307
+ def __len__(self) -> int:
1308
+ return self.length
1309
+
1310
+ def index_to_identifiers(self, index: int) -> Tuple[str, str, str]:
1311
+
1312
+ row = self.test_indices.iloc[index]
1313
+ mix_id = row["song_id"]
1314
+ if self.use_own_query:
1315
+ query_id = mix_id
1316
+ else:
1317
+ query_id = row["query_id"]
1318
+ stem = row["stem"]
1319
+
1320
+ return mix_id, query_id, stem
1321
+
1322
+ def _get_audio(self, stems, identifier: Dict[str, Any]):
1323
+ audio = {}
1324
+
1325
+ for stem in stems:
1326
+ audio[stem] = self.get_full_stem(stem=stem, identifier=identifier)
1327
+
1328
+ return audio
1329
+
1330
+ def __getitem__(self, index: int):
1331
+
1332
+ mix_id, query_id, stem = self.index_to_identifiers(index)
1333
+
1334
+ mix_identifier = dict(song_id=mix_id)
1335
+
1336
+ query_identifier = dict(song_id=query_id)
1337
+
1338
+ audio = self._get_audio([stem, "mixture"], identifier=mix_identifier)
1339
+ query = self.get_query_stem(stem=stem, identifier=query_identifier)
1340
+
1341
+ mixture = audio["mixture"].copy()
1342
+ sources = {stem: audio[stem].copy()}
1343
+ query = query.copy()
1344
+
1345
+ return input_dict(
1346
+ mixture=mixture,
1347
+ sources=sources,
1348
+ query=query,
1349
+ metadata={
1350
+ "mix": mix_identifier["song_id"],
1351
+ "query": query_identifier["song_id"],
1352
+ "stem": stem,
1353
+ },
1354
+ modality="audio",
1355
+ )
1356
+
1357
+
1358
+ if __name__ == "__main__":
1359
+
1360
+ print("Beginning")
1361
+
1362
+ config = "/storage/home/hcoda1/1/kwatchar3/coda/config/data/moisesdb-everything-query-d-aug.yml"
1363
+
1364
+ config = OmegaConf.load(config)
1365
+
1366
+ print("Loaded config")
1367
+
1368
+ dataset = MoisesDBRandomChunkRandomQueryDataset(
1369
+ data_root=config.data_root, split="train", **config.train_kwargs
1370
+ )
1371
+
1372
+ print("Loaded dataset")
1373
+
1374
+ for item in tqdm(dataset, total=len(dataset)):
1375
+ target_audio = item["sources"]["target"]["audio"]
1376
+ mixture = item["mixture"]["audio"]
1377
+
1378
+ if target_audio is None:
1379
+ raise ValueError
1380
+ else:
1381
+ tdb = 10.0 * torch.log10(torch.mean(torch.square(target_audio)) + 1e-6)
1382
+ mdb = 10.0 * torch.log10(torch.mean(torch.square(mixture)) + 1e-6)
1383
+ print(f"Target db: {tdb}, Mixture db: {mdb}")
core/data/moisesdb/eda.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
core/data/moisesdb/npyify.py ADDED
@@ -0,0 +1,923 @@
1
+ from collections import defaultdict
2
+ import glob
3
+ import json
4
+ import math
5
+ import os
6
+ import shutil
7
+ from itertools import chain
8
+ from pprint import pprint
9
+ from types import SimpleNamespace
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+ from omegaconf import OmegaConf
14
+
15
+ from tqdm.contrib.concurrent import process_map
16
+
17
+ from tqdm import tqdm as tdqm, tqdm
18
+ import torchaudio as ta
19
+
20
+ import librosa
21
+
22
+ taxonomy = {
23
+ "vocals": [
24
+ "lead male singer",
25
+ "lead female singer",
26
+ "human choir",
27
+ "background vocals",
28
+ "other (vocoder, beatboxing etc)",
29
+ ],
30
+ "bass": [
31
+ "bass guitar",
32
+ "bass synthesizer (moog etc)",
33
+ "contrabass/double bass (bass of instrings)",
34
+ "tuba (bass of brass)",
35
+ "bassoon (bass of woodwind)",
36
+ ],
37
+ "drums": [
38
+ "snare drum",
39
+ "toms",
40
+ "kick drum",
41
+ "cymbals",
42
+ "overheads",
43
+ "full acoustic drumkit",
44
+ "drum machine",
45
+ "hi-hat"
46
+ ],
47
+ "other": [
48
+ "fx/processed sound, scratches, gun shots, explosions etc",
49
+ "click track",
50
+ ],
51
+ "guitar": [
52
+ "clean electric guitar",
53
+ "distorted electric guitar",
54
+ "lap steel guitar or slide guitar",
55
+ "acoustic guitar",
56
+ ],
57
+ "other plucked": ["banjo, mandolin, ukulele, harp etc"],
58
+ "percussion": [
59
+ "a-tonal percussion (claps, shakers, congas, cowbell etc)",
60
+ "pitched percussion (mallets, glockenspiel, ...)",
61
+ ],
62
+ "piano": [
63
+ "grand piano",
64
+ "electric piano (rhodes, wurlitzer, piano sound alike)",
65
+ ],
66
+ "other keys": [
67
+ "organ, electric organ",
68
+ "synth pad",
69
+ "synth lead",
70
+ "other sounds (hapischord, melotron etc)",
71
+ ],
72
+ "bowed strings": [
73
+ "violin (solo)",
74
+ "viola (solo)",
75
+ "cello (solo)",
76
+ "violin section",
77
+ "viola section",
78
+ "cello section",
79
+ "string section",
80
+ "other strings",
81
+ ],
82
+ "wind": [
83
+ "brass (trumpet, trombone, french horn, brass etc)",
84
+ "flutes (piccolo, bamboo flute, panpipes, flutes etc)",
85
+ "reeds (saxophone, clarinets, oboe, english horn, bagpipe)",
86
+ "other wind",
87
+ ],
88
+ }
89
+
90
+ def clean_npy_other_vox(data_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/npyq"):
91
+ npys = glob.glob(os.path.join(data_root, "**/*.npy"), recursive=True)
92
+
93
+
94
+ npys = [npy for npy in npys if "other" in npy]
95
+ npys = [npy for npy in npys if "vdbo_" not in npy]
96
+ npys = [npy for npy in npys if "other_" not in npy]
97
+
98
+ stems = set([
99
+ os.path.basename(npy).split(".")[0] for npy in npys
100
+ ])
101
+
102
+ assert len(stems) == 1
103
+
104
+ for npy in tqdm(npys):
105
+ shutil.move(npy, npy.replace("other", "other_vocals"))
106
+
107
+
108
+
109
+
110
+ def clean_track_inst(inst):
111
+
112
+ if "vocoder" in inst:
113
+ inst = "other_vocals"
114
+
115
+ if "fx" in inst:
116
+ inst = "fx"
117
+
118
+ if "contrabass_double_bass" in inst:
119
+ inst = "double_bass"
120
+
121
+ if "banjo" in inst:
122
+ return "other_plucked"
123
+
124
+ if "(" in inst:
125
+ inst = inst.split("(")[0]
126
+
127
+ for s in [",", "-"]:
128
+ if s in inst:
129
+ inst = inst.replace(s, "")
130
+
131
+ for s in ["/"]:
132
+ if s in inst:
133
+ inst = inst.replace(s, "_")
134
+
135
+ if inst[-1] == "_":
136
+ inst = inst[:-1]
137
+
138
+ return inst
+ 
+ 
+ taxonomy = {
+     k.replace(" ", "_"): [clean_track_inst(i.replace(" ", "_")) for i in v]
+     for k, v in taxonomy.items()
+ }
+ 
+ fine_to_coarse = {}
+ 
+ for k, v in taxonomy.items():
+     for vv in v:
+         fine_to_coarse[vv] = k
+ 
+ # pprint(fine_to_coarse)
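+ 
+ # Usage sketch: fine_to_coarse maps a normalized fine label to its coarse stem,
+ # e.g. fine_to_coarse["bass_guitar"] == "bass" and
+ # fine_to_coarse["snare_drum"] == "drums".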
+ 
+ 
+ def save_taxonomy():
+     with open("taxonomy.json", "w") as f:
+         json.dump(taxonomy, f, indent=4)
+ 
+     taxonomy_coarse = list(taxonomy.keys())
+ 
+     with open("taxonomy_coarse.json", "w") as f:
+         json.dump(taxonomy_coarse, f, indent=4)
+ 
+     taxonomy_fine = list(chain(*taxonomy.values()))
+ 
+     count_ = defaultdict(int)
+     for t in taxonomy_fine:
+         count_[t] += 1
+ 
+     with open("taxonomy_fine.json", "w") as f:
+         json.dump(taxonomy_fine, f, indent=4)
+ 
+ 
+ possible_coarse = list(taxonomy.keys())
+ possible_fine = list(set(chain(*taxonomy.values())))
+ 
+ 
+ def trim_and_mix(audios, length_=None):
+     length = min([a.shape[-1] for a in audios])
+ 
+     if length_ is not None:
+         length = min(length, length_)
+ 
+     audios = [a[..., :length] for a in audios]
+     return np.sum(np.stack(audios, axis=0), axis=0), length
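+ 
+ # trim_and_mix truncates all tracks to the shortest length present (optionally
+ # capped by length_), sums them into one mix, and returns both the mix and the
+ # length used so callers can re-trim previously saved arrays to match.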
+ 
+ 
+ def retrim_npys(saved_npy, new_length):
+     print("retrimming")
+     for npy in saved_npy:
+         audio = np.load(npy)
+         audio = audio[..., :new_length]
+         np.save(npy, audio)
+ 
+ 
+ def convert_one(inout):
+     input_path = inout.input_path
+     output_root = inout.output_root
+ 
+     song_id = os.path.basename(input_path)
+     output_root = os.path.join(output_root, song_id)
+     os.makedirs(output_root, exist_ok=True)
+ 
+     metadata = OmegaConf.load(os.path.join(input_path, "data.json"))
+     stems = metadata.stems
+ 
+     min_length = None
+     saved_npy = []
+ 
+     all_tracks = []
+     other_tracks = []
+ 
+     outfile = None
+ 
+     added_tracks = set()
+     duplicated_tracks = set()
+     track_to_stem = defaultdict(list)
+     added_stems = set()
+     duplicated_stems = set()
+ 
+     stem_name_to_stems = defaultdict(list)
+ 
+     for stem in stems:
+         stem_name = stem.stemName
+         stem_name_to_stems[stem_name].append(stem)
+ 
+     for stem_name in tqdm(stem_name_to_stems):
+         stem_tracks = []
+         for stem in stem_name_to_stems[stem_name]:
+             stem_name = stem.stemName
+ 
+             if stem_name in added_stems:
+                 print(f"Duplicate stem {stem_name} in {song_id}")
+                 duplicated_stems.add(stem_name)
+ 
+             added_stems.add(stem_name)
+ 
+             for track in stem.tracks:
+                 track_inst = track.trackType
+                 track_inst = clean_track_inst(track_inst)
+ 
+                 if track_inst in added_tracks:
+                     if stem_name in track_to_stem[track_inst]:
+                         continue
+                     print(f"Duplicate track {track_inst} in {song_id}")
+                     print(f"Stems: {track_to_stem[track_inst]}")
+                     duplicated_tracks.add(track_inst)
+                     raise ValueError
+                 else:
+                     added_tracks.add(track_inst)
+ 
+                 track_to_stem[track_inst].append(stem_name)
+                 track_id = track.id
+ 
+                 audio, fs = ta.load(os.path.join(input_path, stem_name, f"{track_id}.wav"))
+ 
+                 if fs != 44100:
+                     print(f"fs is {fs} for {track_id}")
+                     # append so that entries for multiple off-rate tracks are all kept
+                     with open(os.path.join(output_root, "fs.txt"), "a") as f:
+                         f.write(f"{song_id}\t{track_id}\t{fs}\n")
+ 
+                 if min_length is None:
+                     min_length = audio.shape[-1]
+                 else:
+                     if audio.shape[-1] < min_length:
+                         min_length = audio.shape[-1]
+ 
+                         if len(saved_npy) > 0:
+                             retrim_npys(saved_npy, min_length)
+ 
+                 audio = audio[..., :min_length]
+                 audio = audio.numpy()
+                 audio = audio.astype(np.float32)
+ 
+                 if audio.shape[0] == 1:
+                     print("mono")
+                 if audio.shape[0] > 2:
+                     print("multi channel")
+ 
+                 assert outfile is None
+                 outfile = os.path.join(output_root, f"{track_inst}.npy")
+                 np.save(outfile, audio)
+                 saved_npy.append(outfile)
+                 outfile = None
+                 stem_tracks.append(audio)
+                 audio = None
+ 
+         stem_track, min_length = trim_and_mix(stem_tracks)
+ 
+         assert outfile is None
+         outfile = os.path.join(output_root, f"{stem_name}.npy")
+         np.save(outfile, stem_track)
+         saved_npy.append(outfile)
+         outfile = None
+ 
+         all_tracks.append(stem_track)
+ 
+         if stem_name not in ["vocals", "drums", "bass"]:
+             # print(f"Putting {stem_name} in other")
+             other_tracks.append(stem_track)
+ 
+     assert outfile is None
+     all_track, min_length_ = trim_and_mix(all_tracks, min_length)
+     outfile = os.path.join(output_root, "mixture.npy")
+     np.save(outfile, all_track)
+ 
+     if min_length_ != min_length:
+         retrim_npys(saved_npy, min_length_)
+         min_length = min_length_
+ 
+     saved_npy.append(outfile)
+     outfile = None
+ 
+     other_track, min_length_ = trim_and_mix(other_tracks, min_length)
+     np.save(os.path.join(output_root, "vdbo_others.npy"), other_track)
+ 
+     if min_length_ != min_length:
+         retrim_npys(saved_npy, min_length_)
+         min_length = min_length_
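+ 
+ # Tracks within a song can differ slightly in length, so convert_one keeps a
+ # running min_length and calls retrim_npys on everything already written
+ # whenever a shorter track or mix appears; all saved arrays therefore end up
+ # sample-aligned with the final mixture.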
+ 
+ 
+ def convert_to_npy(
+     data_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/canonical",
+     output_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/npy2",
+ ):
+     if output_root is None:
+         output_root = os.path.join(os.path.dirname(data_root), "npy")
+ 
+     files = os.listdir(data_root)
+     files = [
+         os.path.join(data_root, f)
+         for f in files
+         if os.path.isdir(os.path.join(data_root, f))
+     ]
+ 
+     inout = [SimpleNamespace(input_path=f, output_root=output_root) for f in files]
+ 
+     process_map(convert_one, inout)
+ 
+     # for io in tqdm(inout):
+     #     convert_one(io)
+ 
+ 
+ def make_others_one(input_path, dry_run=False):
+     other_stems = [k for k in taxonomy.keys() if k not in ["vocals", "bass", "drums"]]
+     npys = glob.glob(os.path.join(input_path, "**/*.npy"), recursive=True)
+ 
+     npys = [npy for npy in npys if ".dbfs" not in npy]
+     npys = [npy for npy in npys if ".query" not in npy]
+     npys = [npy for npy in npys if "mixture" not in npy]
+     npys = [npy for npy in npys if os.path.basename(npy).split(".")[0] in other_stems]
+ 
+     print(f"Using stems: {[os.path.basename(npy).split('.')[0] for npy in npys]}")
+ 
+     if len(npys) == 0:
+         # no matching stems: fall back to a silent (channel, time) array
+         # shaped like the mixture
+         audio = np.zeros_like(np.load(os.path.join(input_path, "mixture.npy")))
+     else:
+         # stack/sum only in this branch; the zero-stem case above is already a
+         # single (channel, time) array, and summing it again would collapse it to 1-D
+         audio = [np.load(npy) for npy in npys]
+         audio = np.sum(np.stack(audio, axis=0), axis=0)
+ 
+     assert audio.shape[0] == 2
+ 
+     output = os.path.join(input_path, "vdbo_others.npy")
+ 
+     if dry_run:
+         return
+ 
+     np.save(output, audio)
+ 
+ 
+ def check_vdbo_one(f):
+     s = np.sum(
+         np.stack(
+             [
+                 np.load(os.path.join(f, stem + ".npy"))
+                 for stem in ["vocals", "drums", "bass", "vdbo_others"]
+                 if os.path.exists(os.path.join(f, stem + ".npy"))
+             ],
+             axis=0,
+         ),
+         axis=0,
+     )
+     m = np.load(os.path.join(f, "mixture.npy"))
+     snr = 10 * np.log10(np.mean(np.square(m)) / np.mean(np.square(s - m)))
+     print(snr)
+ 
+     return snr
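+ 
+ # Sanity check: if the VDBO stems were written consistently, their sum s should
+ # be close to the mixture m, so the reconstruction SNR
+ # 10 * log10(E[m^2] / E[(s - m)^2]) should be large; low values flag songs
+ # whose stems do not add back up to the mixture.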
+ 
+ 
+ def check_vdbo(data_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/npy2"):
+     files = os.listdir(data_root)
+ 
+     files = [
+         os.path.join(data_root, f)
+         for f in files
+         if os.path.isdir(os.path.join(data_root, f))
+     ]
+ 
+     snrs = process_map(check_vdbo_one, files)
+ 
+     np.save("/storage/home/hcoda1/1/kwatchar3/data/vdbo.npy", np.array(snrs))
+ 
+ 
+ def make_others(data_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/npy2"):
+     files = os.listdir(data_root)
+ 
+     files = [
+         os.path.join(data_root, f)
+         for f in files
+         if os.path.isdir(os.path.join(data_root, f))
+     ]
+ 
+     process_map(make_others_one, files)
+ 
+     # for f in tqdm(files):
+     #     make_others_one(f, dry_run=False)
+ 
+ 
+ def extract_metadata_one(input_path):
+     song_id = os.path.basename(input_path)
+     metadata = OmegaConf.load(os.path.join(input_path, "data.json"))
+ 
+     song = metadata.song
+     artist = metadata.artist
+     genre = metadata.genre
+ 
+     stems = metadata.stems
+     data_out = []
+ 
+     for stem in stems:
+         stem_name = stem.stemName
+         stem_id = stem.id
+         for track in stem.tracks:
+             track_inst = track.trackType
+             track_id = track.id
+ 
+             data_out.append(
+                 {
+                     "song_id": song_id,
+                     "song": song,
+                     "artist": artist,
+                     "genre": genre,
+                     "stem_name": stem_name,
+                     "stem_id": stem_id,
+                     "track_inst": track_inst,
+                     "track_id": track_id,
+                     "has_bleed": track.has_bleed,
+                 }
+             )
+ 
+     return data_out
+ 
+ 
+ def consolidate_metadata(
+     data_root="/home/kwatchar3/Documents/data/moisesdb/canonical",
+ ):
+     files = os.listdir(data_root)
+     files = [
+         os.path.join(data_root, f)
+         for f in files
+         if os.path.isdir(os.path.join(data_root, f))
+     ]
+ 
+     data = process_map(extract_metadata_one, files)
+ 
+     df = pd.DataFrame.from_records(list(chain(*data)))
+ 
+     df.to_csv(os.path.join(os.path.dirname(data_root), "metadata.csv"), index=False)
+ 
+ 
+ def clean_canonical(data_root="/home/kwatchar3/Documents/data/moisesdb/canonical"):
+     npy = glob.glob(os.path.join(data_root, "**/*.npy"), recursive=True)
+ 
+     for n in tqdm(npy):
+         os.remove(n)
+ 
+ 
+ def remove_dbfs(data_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/npy"):
+     npy = glob.glob(os.path.join(data_root, "**/*.dbfs.npy"), recursive=True)
+ 
+     for n in tqdm(npy):
+         os.remove(n)
+ 
+ 
+ def make_split(
+     metadata_path="/home/kwatchar3/Documents/data/moisesdb/metadata.csv",
+     n_splits=5,
+     seed=42,
+ ):
+     df = pd.read_csv(metadata_path)
+     # print(df.columns)
+     df = df[["song_id", "genre"]].drop_duplicates()
+ 
+     genres = df["genre"].value_counts()
+     genres_map = {g: g if c > n_splits else "other" for g, c in genres.items()}
+ 
+     df["genre"] = df["genre"].map(genres_map)
+ 
+     n_samples = len(df)
+     n_per_split = n_samples // n_splits
+ 
+     np.random.seed(seed)
+ 
+     from sklearn.model_selection import train_test_split
+ 
+     splits = []
+ 
+     df_ = df.copy()
+ 
+     for i in range(n_splits - 1):
+         df_, test = train_test_split(
+             df_,
+             test_size=n_per_split,
+             random_state=seed,
+             stratify=df_["genre"],
+             shuffle=True,
+         )
+ 
+         dfs = test[["song_id"]].copy().sort_values(by="song_id")
+         dfs["split"] = i + 1
+         splits.append(dfs)
+ 
+     test = df_
+     dfs = test[["song_id"]].copy().sort_values(by="song_id")
+     dfs["split"] = n_splits
+     splits.append(dfs)
+ 
+     splits = pd.concat(splits)
+ 
+     splits.to_csv(
+         os.path.join(os.path.dirname(metadata_path), "splits.csv"), index=False
+     )
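+ 
+ # Split logic in brief: genres with at most n_splits songs are lumped into
+ # "other" so stratification cannot fail; n_splits - 1 genre-stratified chunks
+ # of n_samples // n_splits songs are then peeled off with train_test_split,
+ # and whatever remains becomes the last split.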
+ 
+ 
+ def consolidate_stems(data_root="/home/kwatchar3/Documents/data/moisesdb/npy"):
+     metadata = pd.read_csv(os.path.join(os.path.dirname(data_root), "metadata.csv"))
+ 
+     dfg = metadata.groupby("song_id")[["stem_name", "track_inst"]]
+ 
+     pprint(dfg)
+ 
+     df = []
+ 
+     def make_stem_dict(song_id, track_inst, stem_names):
+         d = {"song_id": song_id}
+ 
+         for inst in possible_fine:
+             d[inst] = int(inst in track_inst)
+ 
+         for inst in possible_coarse:
+             d[inst] = int(inst in stem_names)
+ 
+         return d
+ 
+     for song_id, dfgg in dfg:
+         track_inst = dfgg["track_inst"].tolist()
+         track_inst = list(set(track_inst))
+         track_inst = [clean_track_inst(inst) for inst in track_inst]
+ 
+         stem_names = dfgg["stem_name"].tolist()
+         stem_names = list(set([clean_track_inst(inst) for inst in stem_names]))
+ 
+         d = make_stem_dict(song_id, track_inst, stem_names)
+         df.append(d)
+ 
+     print(df)
+ 
+     df = pd.DataFrame.from_records(df)
+ 
+     df.to_csv(os.path.join(os.path.dirname(data_root), "stems.csv"), index=False)
+ 
+ 
+ def get_dbfs(data_root="/home/kwatchar3/Documents/data/moisesdb/npy"):
+     npys = glob.glob(os.path.join(data_root, "**/*.npy"), recursive=True)
+ 
+     dbfs = []
+ 
+     for npy in tqdm(npys):
+         audio = np.load(npy)
+         song_id = os.path.basename(os.path.dirname(npy))
+         track_id = os.path.basename(npy).split(".")[0]
+ 
+         dbfs.append(
+             {
+                 "song_id": song_id,
+                 "track_id": track_id,
+                 "dbfs": 10 * np.log10(np.mean(np.square(audio))),
+             }
+         )
+ 
+     dbfs = pd.DataFrame.from_records(dbfs)
+ 
+     dbfs.to_csv(os.path.join(os.path.dirname(data_root), "dbfs.csv"), index=False)
+ 
+     return dbfs
+ 
+ 
+ def get_dbfs_by_chunk_one(inout):
+     audio = np.load(inout.audio_path, mmap_mode="r")
+     chunk_size = inout.chunk_size
+     fs = inout.fs
+     hop_size = inout.hop_size
+ 
+     n_chan, n_samples = audio.shape
+     chunk_size_samples = int(chunk_size * fs)
+     hop_size_samples = int(hop_size * fs)
+ 
+     x2win = np.lib.stride_tricks.sliding_window_view(
+         np.square(audio), chunk_size_samples, axis=1
+     )[:, ::hop_size_samples, :]
+ 
+     x2win_mean = np.mean(x2win, axis=(0, 2))
+     x2win_mean[x2win_mean == 0] = 1e-8
+     dbfs = 10 * np.log10(x2win_mean)
+ 
+     # song_id = os.path.basename(os.path.dirname(inout.audio_path))
+     track_id = os.path.basename(inout.audio_path).split(".")[0]
+ 
+     # NB: the dBFS array is saved next to the source file; inout.output_path is
+     # not used here
+     np.save(
+         os.path.join(os.path.dirname(inout.audio_path), f"{track_id}.dbfs.npy"), dbfs
+     )
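+ 
+ # Windowed dBFS in brief: sliding_window_view lays a chunk_size-second window
+ # at every sample offset, the ::hop_size_samples stride keeps one window every
+ # hop_size seconds, and averaging the squared signal over channels and window
+ # samples gives mean power per window, floored at 1e-8 (-80 dB) before the log
+ # so silent windows do not produce -inf.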
+ 
+ 
+ def clean_data_root(data_root="/home/kwatchar3/Documents/data/moisesdb/npy"):
+     npys = glob.glob(os.path.join(data_root, "**/*.npy"), recursive=True)
+ 
+     for npy in tqdm(npys):
+         if ".dbfs" in npy or ".query" in npy:
+             # print("removing", npy)
+             os.remove(npy)
+ 
+ 
+ def get_dbfs_by_chunk(
+     data_root="/home/kwatchar3/Documents/data/moisesdb/npy",
+     query_root="/home/kwatchar3/Documents/data/moisesdb/npyq",
+ ):
+     npys = glob.glob(os.path.join(data_root, "**/*.npy"), recursive=True)
+ 
+     inout = [
+         SimpleNamespace(
+             audio_path=npy,
+             chunk_size=1,
+             hop_size=0.125,
+             fs=44100,
+             output_path=npy.replace(data_root, query_root).replace(
+                 ".npy", ".query.npy"
+             ),
+         )
+         for npy in npys
+     ]
+ 
+     process_map(get_dbfs_by_chunk_one, inout, chunksize=2)
+ 
+ 
+ def round_samples(seconds, fs, hop_size, downsample):
+     n_frames = math.ceil(seconds * fs / hop_size) + 1
+     n_frames_down = math.ceil(n_frames / downsample)
+     n_frames = n_frames_down * downsample
+     n_samples = (n_frames - 1) * hop_size
+ 
+     return int(n_samples)
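+ 
+ # Worked example for round_samples: seconds=10, fs=44100, hop_size=512,
+ # downsample=64 gives ceil(441000 / 512) + 1 = 863 frames, rounded up to
+ # 896 = 14 * 64, i.e. (896 - 1) * 512 = 458240 samples, so a segment of this
+ # length produces an STFT frame count divisible by the downsampling factor.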
+ 
+ 
+ def get_query_one(inout):
+     audio = np.load(inout.audio_path, mmap_mode="r")
+     chunk_size = inout.chunk_size
+     fs = inout.fs
+     output_path = inout.output_path
+     do_round = inout.round  # local name chosen so the builtin round() is not shadowed
+     hop_size = inout.hop_size
+ 
+     if do_round:
+         chunk_size_samples = round_samples(chunk_size, fs, 512, 2**6)
+     else:
+         chunk_size_samples = int(chunk_size * fs)
+ 
+     audio_mono = np.mean(audio, axis=0)
+ 
+     onset = librosa.onset.onset_detect(  # currently unused
+         y=audio_mono, sr=fs, units="frames", hop_length=hop_size
+     )
+ 
+     onset_strength = librosa.onset.onset_strength(
+         y=audio_mono, sr=fs, hop_length=hop_size
+     )
+ 
+     n_frames_per_chunk = chunk_size_samples // hop_size
+ 
+     onset_strength_slide = np.lib.stride_tricks.sliding_window_view(
+         onset_strength, n_frames_per_chunk, axis=0
+     )
+ 
+     onset_strength = np.mean(onset_strength_slide, axis=1)
+ 
+     max_onset_frame = np.argmax(onset_strength)
+ 
+     max_onset_samples = librosa.frames_to_samples(max_onset_frame, hop_length=hop_size)
+ 
+     track_id = os.path.basename(inout.audio_path).split(".")[0]
+ 
+     segment = audio[:, max_onset_samples : max_onset_samples + chunk_size_samples]
+ 
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+ 
+     np.save(output_path, segment)
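+ 
+ # Query selection in brief: onset strength is averaged over a sliding window
+ # one chunk long, and the window with the highest mean onset energy becomes
+ # the query segment, i.e. the most rhythmically active chunk_size seconds of
+ # the track.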
+ 
+ 
+ def get_query_from_onset(
+     data_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/npy2",  # "/home/kwatchar3/Documents/data/moisesdb/npy"
+     query_root="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/npyq",  # "/home/kwatchar3/Documents/data/moisesdb/npyq"
+     query_file="query-10s",
+     pmap=True,
+ ):
+     npys = glob.glob(os.path.join(data_root, "**/*.npy"), recursive=True)
+ 
+     npys = [npy for npy in npys if "dbfs" not in npy]
+ 
+     inout = [
+         SimpleNamespace(
+             audio_path=npy,
+             chunk_size=10,
+             hop_size=512,
+             round=False,
+             fs=44100,
+             output_path=npy.replace(data_root, query_root).replace(
+                 ".npy", f".{query_file}.npy"
+             ),
+         )
+         for npy in npys
+     ]
+ 
+     if pmap:
+         process_map(get_query_one, inout, chunksize=2, max_workers=24)
+     else:
+         for io in tqdm(inout):
+             get_query_one(io)
+ 
+ 
+ def get_durations(data_root="/home/kwatchar3/Documents/data/moisesdb/npy"):
+     npys = glob.glob(os.path.join(data_root, "**/mixture.npy"), recursive=True)
+ 
+     durations = []
+ 
+     for npy in tqdm(npys):
+         audio = np.load(npy, mmap_mode="r")
+         song_id = os.path.basename(os.path.dirname(npy))
+         track_id = os.path.basename(npy).split(".")[0]
+ 
+         durations.append(
+             {
+                 "song_id": song_id,
+                 "track_id": track_id,
+                 "duration": audio.shape[-1] / 44100,
+             }
+         )
+ 
+     durations = pd.DataFrame.from_records(durations)
+ 
+     durations.to_csv(
+         os.path.join(os.path.dirname(data_root), "durations.csv"), index=False
+     )
+ 
+     return durations
+ 
+ 
+ def clean_query_root(
+     data_root="/home/kwatchar3/Documents/data/moisesdb/npy",
+     query_root="/home/kwatchar3/Documents/data/moisesdb/npyq",
+ ):
+     npys = glob.glob(os.path.join(data_root, "**/*.query.npy"), recursive=True)
+ 
+     for npy in tqdm(npys):
+         dst = npy.replace(data_root, query_root)
+         dstdir = os.path.dirname(dst)
+         os.makedirs(dstdir, exist_ok=True)
+         shutil.move(npy, dst)
+ 
+ 
+ def make_test_indices(
+     metadata_path="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/metadata.csv",
+     stem_path="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/stems.csv",
+     splits_path="/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/splits.csv",
+     test_split=5,
+ ):
+     coarse_stems = set(taxonomy.keys())
+     fine_stems = set(chain(*taxonomy.values()))
+ 
+     metadata = pd.read_csv(metadata_path)
+     splits = pd.read_csv(splits_path)
+     stems = pd.read_csv(stem_path)
+ 
+     file_in_test = splits[splits["split"] == test_split]["song_id"].tolist()
+ 
+     stems_test = stems[stems["song_id"].isin(file_in_test)]
+     metadata_test = metadata[metadata["song_id"].isin(file_in_test)]
+     splits_test = splits[splits["split"] == test_split]
+ 
+     stems_test = stems_test.set_index("song_id")
+     metadata_test = metadata_test.drop_duplicates("song_id").set_index("song_id")
+     splits_test = splits_test.set_index("song_id")
+ 
+     stem_to_song_id = defaultdict(list)
+     song_id_to_stem = defaultdict(list)
+ 
+     for song_id in file_in_test:
+         stems_ = stems_test.loc[song_id]
+         stem_names = stems_.T
+         stem_names = stem_names[stem_names == 1].index.tolist()
+ 
+         for stem in stem_names:
+             stem_to_song_id[stem].append(song_id)
+ 
+         song_id_to_stem[song_id] = stem_names
+ 
+     indices = []
+     no_query = []
+ 
+     for song_id in file_in_test:
+         genre = metadata_test.loc[song_id, "genre"]
+         # print(genre)
+         artist = metadata_test.loc[song_id, "artist"]
+         # print(artist)
+ 
+         stems_ = song_id_to_stem[song_id]
+ 
+         for stem in stems_:
+             possible_query = stem_to_song_id[stem]
+             possible_query = [p for p in possible_query if p != song_id]
+ 
+             if len(possible_query) == 0:
+                 print(f"No possible query for {song_id} with {stem}")
+ 
+                 no_query.append({"song_id": song_id, "stem": stem})
+                 continue
+ 
+             query_df = metadata_test.loc[possible_query, ["genre", "artist"]]
+ 
+             assert len(query_df) > 0
+ 
+             query_df_ = query_df.copy()
+ 
+             same_genre = True
+             different_artist = True
+             query_df = query_df[(query_df["genre"] == genre) & (query_df["artist"] != artist)]
+ 
+             if len(query_df) == 0:
+                 same_genre = False
+                 different_artist = True
+ 
+                 query_df = query_df_.copy()
+                 query_df = query_df[(query_df["artist"] != artist)]
+ 
+             if len(query_df) == 0:
+                 same_genre = True
+                 different_artist = False
+ 
+                 query_df = query_df_.copy()
+                 query_df = query_df[(query_df["genre"] == genre)]
+ 
+             if len(query_df) == 0:
+                 same_genre = False
+                 different_artist = False
+ 
+                 query_df = query_df_.copy()
+ 
+             query_id = query_df.sample(1).index[0]
+ 
+             indices.append(
+                 {
+                     "song_id": song_id,
+                     "query_id": query_id,
+                     "stem": stem,
+                     "same_genre": same_genre,
+                     "different_artist": different_artist,
+                 }
+             )
+ 
+     indices = pd.DataFrame.from_records(indices)
+     no_query = pd.DataFrame.from_records(no_query)
+ 
+     indices.to_csv(
+         os.path.join(os.path.dirname(metadata_path), "test_indices.csv"), index=False
+     )
+ 
+     no_query.to_csv(
+         os.path.join(os.path.dirname(metadata_path), "no_query.csv"), index=False
+     )
+ 
+     print("Total number of queries:", len(indices))
+     print("Total number of no queries:", len(no_query))
+ 
+     query_type = indices.groupby(["same_genre", "different_artist"]).size()
+ 
+     print(query_type)
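+ 
+ # Query fallback ladder: prefer a query song with the same genre and a
+ # different artist; failing that, any different artist; then same genre
+ # regardless of artist; and finally any other test song containing the stem.
+ # The (same_genre, different_artist) flags record which rung was used.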
+ 
+ 
+ if __name__ == "__main__":
+     import fire
+ 
+     fire.Fire()
core/data/moisesdb/passt.ipynb ADDED
@@ -0,0 +1,32 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import glob\n",
+     "\n",
+     "\n",
+     "data_root = \"/storage/home/hcoda1/1/kwatchar3/data/data/moisesdb/passt\"\n",
+     "\n",
+     "files = glob.glob(data_root + \"/*.passt.npy\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "language_info": {
+    "name": "python"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
core/losses/__init__.py ADDED
File without changes
core/losses/base.py ADDED
@@ -0,0 +1,171 @@
+ from typing import Dict, List, Optional, Union
+ 
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from torch.nn.modules.loss import _Loss
+ 
+ from core.types import BatchedInputOutput
+ 
+ 
+ class BaseLossHandler(nn.Module):
+     def __init__(
+         self, loss: nn.Module, modality: Union[str, List[str]], name: Optional[str] = None
+     ) -> None:
+         super().__init__()
+ 
+         self.loss = loss
+ 
+         if isinstance(modality, str):
+             modality = [modality]
+ 
+         self.modality = modality
+ 
+         if name is None:
+             name = "loss"
+ 
+         if name == "__auto__":
+             name = self.loss.__class__.__name__
+ 
+         self.name = name
+ 
+     def _audio_preprocess(self, y_pred, y_true):
+         n_sample_true = y_true.shape[-1]
+         n_sample_pred = y_pred.shape[-1]
+ 
+         if n_sample_pred > n_sample_true:
+             y_pred = y_pred[..., :n_sample_true]
+         elif n_sample_pred < n_sample_true:
+             y_true = y_true[..., :n_sample_pred]
+ 
+         return y_pred, y_true
+ 
+     def forward(self, batch: BatchedInputOutput):
+         y_true = batch.sources
+         y_pred = batch.estimates
+ 
+         loss_contribs = {}
+ 
+         stem_contribs = {stem: 0.0 for stem in y_pred.keys()}
+ 
+         for stem in y_pred.keys():
+             for modality in self.modality:
+                 if modality not in y_pred[stem].keys():
+                     continue
+ 
+                 if y_pred[stem][modality].shape[-1] == 0:
+                     continue
+ 
+                 y_true_ = y_true[stem][modality]
+                 y_pred_ = y_pred[stem][modality]
+ 
+                 if modality == "audio":
+                     y_pred_, y_true_ = self._audio_preprocess(y_pred_, y_true_)
+                 elif modality == "spectrogram":
+                     y_pred_ = torch.view_as_real(y_pred_)
+                     y_true_ = torch.view_as_real(y_true_)
+ 
+                 loss_contribs[f"{self.name}/{stem}/{modality}"] = self.loss(
+                     y_pred_, y_true_
+                 )
+ 
+                 stem_contribs[stem] += loss_contribs[f"{self.name}/{stem}/{modality}"]
+ 
+         total_loss = sum(stem_contribs.values())
+         loss_contribs[self.name] = total_loss
+ 
+         with torch.no_grad():
+             for stem in stem_contribs.keys():
+                 loss_contribs[f"{self.name}/{stem}"] = stem_contribs[stem]
+ 
+         return loss_contribs
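+ 
+ # BaseLossHandler.forward returns a flat dict of scalars: the total under
+ # self.name, per-stem sums under f"{self.name}/{stem}", and per-stem,
+ # per-modality terms under f"{self.name}/{stem}/{modality}".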
+ 
+ 
+ class AdversarialLossHandler(BaseLossHandler):
+     def __init__(self, loss: nn.Module, modality: str, name: Optional[str] = "adv_loss"):
+         super().__init__(loss, modality, name)
+ 
+     def discriminator_forward(self, batch: BatchedInputOutput):
+         y_true = batch.sources
+         y_pred = batch.estimates
+ 
+         # the base class normalizes `modality` to a list; this handler expects
+         # a single modality, so unwrap it before the membership checks below
+         modality = self.modality[0]
+ 
+         # g_loss_contribs = {}
+         d_loss_contribs = {}
+ 
+         for stem in y_pred.keys():
+             if modality not in y_pred[stem].keys():
+                 continue
+ 
+             if y_pred[stem][modality].shape[-1] == 0:
+                 continue
+ 
+             y_true_ = y_true[stem][modality]
+             y_pred_ = y_pred[stem][modality]
+ 
+             if modality == "audio":
+                 y_pred_, y_true_ = self._audio_preprocess(y_pred_, y_true_)
+ 
+             # g_loss_contribs[f"{self.name}:g/{stem}"] = self.loss.generator_loss(
+             #     y_pred_, y_true_
+             # )
+ 
+             d_loss_contribs[f"{self.name}:d/{stem}"] = self.loss.discriminator_loss(
+                 y_pred_, y_true_
+             )
+ 
+         # g_total_loss = sum(g_loss_contribs.values())
+         d_total_loss = sum(d_loss_contribs.values())
+ 
+         # g_loss_contribs["loss"] = g_total_loss
+         d_loss_contribs["disc_loss"] = d_total_loss
+ 
+         return d_loss_contribs
+ 
+     def generator_forward(self, batch: BatchedInputOutput):
+         y_true = batch.sources
+         y_pred = batch.estimates
+ 
+         modality = self.modality[0]
+ 
+         g_loss_contribs = {}
+         # d_loss_contribs = {}
+ 
+         for stem in y_pred.keys():
+             if modality not in y_pred[stem].keys():
+                 continue
+ 
+             if y_pred[stem][modality].shape[-1] == 0:
+                 continue
+ 
+             y_true_ = y_true[stem][modality]
+             y_pred_ = y_pred[stem][modality]
+ 
+             if modality == "audio":
+                 y_pred_, y_true_ = self._audio_preprocess(y_pred_, y_true_)
+ 
+             g_loss_contribs[f"{self.name}:g/{stem}"] = self.loss.generator_loss(
+                 y_pred_, y_true_
+             )
+ 
+             # d_loss_contribs[f"{self.name}:d/{stem}"] = self.loss.discriminator_loss(
+             #     y_pred_, y_true_
+             # )
+ 
+         g_total_loss = sum(g_loss_contribs.values())
+         # d_total_loss = sum(d_loss_contribs.values())
+ 
+         g_loss_contribs["gen_loss"] = g_total_loss
+         # d_loss_contribs["loss"] = d_total_loss
+ 
+         return g_loss_contribs
+ 
+     def forward(self, batch: BatchedInputOutput):
+         return {
+             "generator": self.generator_forward(batch),
+             "discriminator": self.discriminator_forward(batch),
+         }
core/losses/l1snr.py ADDED
@@ -0,0 +1,110 @@
+ import torch
+ import torch.nn.functional as F
+ from torch.nn.modules.loss import _Loss
+ 
+ 
+ class WeightedL1Loss(_Loss):
+     # NB: the `weights` argument is currently unused; per-example weights are
+     # derived from the mean absolute value of the target in forward() instead
+     def __init__(self, weights=None):
+         super().__init__()
+ 
+     def forward(self, y_pred, y_true):
+         ndim = y_pred.ndim
+         dims = list(range(1, ndim))
+         loss = F.l1_loss(y_pred, y_true, reduction="none")
+         loss = torch.mean(loss, dim=dims)
+         weights = torch.mean(torch.abs(y_true), dim=dims)
+ 
+         loss = torch.sum(loss * weights) / torch.sum(weights)
+ 
+         return loss
+ 
+ 
+ class L1MatchLoss(_Loss):
+     def __init__(self):
+         super().__init__()
+ 
+     def forward(self, y_pred, y_true):
+         batch_size = y_pred.shape[0]
+ 
+         y_pred = y_pred.reshape(batch_size, -1)
+         y_true = y_true.reshape(batch_size, -1)
+ 
+         l1_true = torch.mean(torch.abs(y_true), dim=-1)
+         l1_pred = torch.mean(torch.abs(y_pred), dim=-1)
+         loss = torch.mean(torch.abs(l1_pred - l1_true))
+ 
+         return loss
+ 
+ 
+ class DecibelMatchLoss(_Loss):
+     def __init__(self, eps=1e-3):
+         super().__init__()
+ 
+         self.eps = eps
+ 
+     def forward(self, y_pred, y_true):
+         batch_size = y_pred.shape[0]
+ 
+         y_pred = y_pred.reshape(batch_size, -1)
+         y_true = y_true.reshape(batch_size, -1)
+ 
+         db_true = 10.0 * torch.log10(self.eps + torch.mean(torch.square(torch.abs(y_true)), dim=-1))
+         db_pred = 10.0 * torch.log10(self.eps + torch.mean(torch.square(torch.abs(y_pred)), dim=-1))
+         loss = torch.mean(torch.abs(db_pred - db_true))
+ 
+         return loss
+ 
+ 
+ class L1SNRLoss(_Loss):
+     def __init__(self, eps=1e-3):
+         super().__init__()
+         self.eps = torch.tensor(eps)
+ 
+     def forward(self, y_pred, y_true):
+         batch_size = y_pred.shape[0]
+ 
+         y_pred = y_pred.reshape(batch_size, -1)
+         y_true = y_true.reshape(batch_size, -1)
+ 
+         l1_error = torch.mean(torch.abs(y_pred - y_true), dim=-1)
+         l1_true = torch.mean(torch.abs(y_true), dim=-1)
+ 
+         snr = 20.0 * torch.log10((l1_true + self.eps) / (l1_error + self.eps))
+ 
+         return -torch.mean(snr)
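+ 
+ # L1SNRLoss maximizes an L1-based SNR per example:
+ #   snr = 20 * log10((mean|y_true| + eps) / (mean|y_pred - y_true| + eps))
+ # and returns its negative mean over the batch; eps both avoids log(0) and
+ # caps the attainable SNR for near-silent targets.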
+ 
+ 
+ class L1SNRLossIgnoreSilence(_Loss):
+     def __init__(self, eps=1e-3, dbthresh=-20, dbthresh_step=20):
+         super().__init__()
+         self.eps = torch.tensor(eps)
+         self.dbthresh = dbthresh
+         self.dbthresh_step = dbthresh_step
+ 
+     def forward(self, y_pred, y_true):
+         batch_size = y_pred.shape[0]
+ 
+         y_pred = y_pred.reshape(batch_size, -1)
+         y_true = y_true.reshape(batch_size, -1)
+ 
+         l1_error = torch.mean(torch.abs(y_pred - y_true), dim=-1)
+         l1_true = torch.mean(torch.abs(y_true), dim=-1)
+ 
+         snr = 20.0 * torch.log10((l1_true + self.eps) / (l1_error + self.eps))
+ 
+         db = 10.0 * torch.log10(torch.mean(torch.square(y_true), dim=-1) + 1e-6)
+ 
+         # average only over examples above the loudness threshold, relaxing the
+         # threshold by dbthresh_step (and then entirely) if nothing passes
+         if torch.sum(db > self.dbthresh) == 0:
+             if torch.sum(db > self.dbthresh - self.dbthresh_step) == 0:
+                 return -torch.mean(snr)
+             else:
+                 return -torch.mean(snr[db > self.dbthresh - self.dbthresh_step])
+ 
+         return -torch.mean(snr[db > self.dbthresh])
+ 
+ 
+ class L1SNRDecibelMatchLoss(_Loss):
+     def __init__(self, db_weight=0.1, l1snr_eps=1e-3, dbeps=1e-3):
+         super().__init__()
+         self.l1snr = L1SNRLoss(l1snr_eps)
+         self.decibel_match = DecibelMatchLoss(dbeps)
+         self.db_weight = db_weight
+ 
+     def forward(self, y_pred, y_true):
+         # the decibel-match term is scaled by db_weight, matching the
+         # constructor signature
+         return self.l1snr(y_pred, y_true) + self.db_weight * self.decibel_match(y_pred, y_true)
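+ 
+ # Hedged usage sketch (class names from this file; tensor shapes assumed to be
+ # (batch, channels, samples), which is an assumption, not verified here):
+ #   criterion = L1SNRDecibelMatchLoss(db_weight=0.1)
+ #   loss = criterion(estimates, targets)  # L1-SNR term + 0.1 * dB-match term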