gin/ddspae-cnn.gin

# -*-Python-*-
# Decodes from (loudness, f0). Has a trainable reverb component as well.
# Since it uses a trainable reverb, training data should all be from the same
# acoustic environment.

include 'models/ae.gin'

import thesis.dilated_conv

# Encoder
# No encoder is configured for this model.
# NOTE(review): the decoder's input_keys in this file are 'ld_scaled' and
# 'f0_scaled'; presumably these come from the preprocessor set up in
# models/ae.gin -- confirm.
Autoencoder.encoder = None

# Decoder
# The trailing `()` makes gin evaluate the reference, so the Autoencoder
# receives the configured decoder rather than the bare configurable.
# NOTE(review): CustomDilatedConvDecoder is presumably registered by the
# `thesis.dilated_conv` import above -- confirm.
Autoencoder.decoder = @CustomDilatedConvDecoder()

# Hyperparameters of the CustomDilatedConvDecoder bound to Autoencoder.decoder.
# NOTE(review): the meaning of each parameter is defined by the decoder class,
# which is not shown in this file; the section headings below are inferred from
# the parameter names -- confirm against the class.

# Features the decoder reads as input.
CustomDilatedConvDecoder.input_keys = ('ld_scaled', 'f0_scaled')

# Convolution stack settings.
CustomDilatedConvDecoder.ch = 128
CustomDilatedConvDecoder.stacks = 2
CustomDilatedConvDecoder.layers_per_stack = 3
CustomDilatedConvDecoder.kernel_size = 3
CustomDilatedConvDecoder.dilation = 3
CustomDilatedConvDecoder.causal = False

# No resampling, unlike in RAVE: DDSP expects the same sampling rate as for the
# f0 signal from the encoder.
# Disabled RAVE-style alternative:
#   CustomDilatedConvDecoder.resample_stride = 2
CustomDilatedConvDecoder.resample_after_convolve = False

# Named output splits; these keys are the inputs consumed by ProcessorGroup.dag.
# Disabled alternative (single embedding output):
#   CustomDilatedConvDecoder.output_splits = (('control_embedding', %decoder_output_channels),)
CustomDilatedConvDecoder.output_splits = (
    ('amps', 1),
    ('harmonic_distribution', 60),
    ('noise_magnitudes', 65)
)

# ==============
# ProcessorGroup
# ==============

# Signal flow, one (processor, input keys) node per line:
#   harmonic synth + filtered noise -> add -> reverb
# 'amps', 'harmonic_distribution' and 'noise_magnitudes' are the decoder's
# output_splits; 'f0_hz' is supplied from outside this file.
ProcessorGroup.dag = [
    (@synths.Harmonic(), ['amps', 'harmonic_distribution', 'f0_hz']),
    (@synths.FilteredNoise(), ['noise_magnitudes']),
    (@processors.Add(), ['filtered_noise/signal', 'harmonic/signal']),
    (@effects.Reverb(), ['add/signal']),
]

# Reverb
# Settings for the Reverb processor that ends ProcessorGroup.dag.
Reverb.name = 'reverb'
# NOTE(review): reverb_length is presumably the impulse-response length in
# samples -- confirm against the DDSP Reverb documentation.
Reverb.reverb_length = 16000
# The reverb is learned during training, which is why the training data should
# all come from the same acoustic environment (see the note at the top of this
# file).
Reverb.trainable = True
# Disabled option, kept for reference:
#Reverb.decay_after = 16000

# Log spectrograms.
# `@summarize_ddspae` has no trailing `()`, so the function itself is passed to
# the training loop as a callback rather than being called at config-parse time.
train_util.train.model_specific_summary_fn = @summarize_ddspae