# StateMachine/SM_test_processes.yaml

#---------------------------------------#
#        FINITE TOOLS PARAMETERS        #
#---------------------------------------#

# Tool 0: Read Audio (from file to NAO)
T0_ReadAudio:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  python_version: "2.7"
  port: 5000
  # Audio file to read; same path as T3_TTS.output_wav_file.
  input_wav_file: data/live/audio_generated.wav

# Tool 1: Person Recognition
T1_PersonRecognition:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  python_version: "3.11"
  port: 5001
  # Text file the tool writes its result to.
  output_text_file: data/live/person_recognized.txt
  # Audio to analyse; same path as T6_RecordAudio.output_wav_file.
  input_audio_file: data/live/audio_recorded.wav
  # Folder of stored people data and folder of pretrained models.
  input_people_folder: data/stored/people/
  input_model_dir: data/pretrained_models/
  # NOTE(review): scale and comparison direction of this threshold are not
  # visible here - confirm against the tool's implementation.
  rejection_threshold: 0.17

# Tool 2: Text Generation
T2_TextGeneration:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  python_version: "3.11"
  port: 5002
  # Prompt is read from the input file; generated text goes to the output file.
  input_text_file: data/live/text_prompt.txt
  output_text_file: data/live/text_generated.txt
  # Available choices: gemma, llama2, llama3, mistral, openbuddy-mistral,
  # openchat, openhermes, qwen:4b, qwen:7b, vigogne, vigostral, phi3
  model_name: "openchat"
  # Sampling options; presumably forwarded as-is to Ollama's generate call
  # (TODO confirm against the tool's implementation).
  ollama_generate_options:
    top_k: 50
    top_p: 0.95
    temperature: 0.8
    repeat_penalty: 1.9
    repeat_last_n: 100

# Tool 3: Text to Speech
T3_TTS:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  # The version stays quoted: an unquoted 3.10 would be read as the float 3.1.
  python_version: "3.10"
  port: 5003
  # Available choices:
  #   "tts_models/multilingual/multi-dataset/your_tts"
  #   "tts_models/fr/css10/vits"
  #   "tts_models/multilingual/multi-dataset/xtts_v2"
  #   "tts_models/fr/mai/tacotron2-DDC"
  model_name: "tts_models/multilingual/multi-dataset/your_tts"
  # Accepted values: "fr-fr", "fr"
  language: "fr-fr"
  # Text to synthesise; same path as T4_ActionSelection.input_text_file.
  input_text_file: data/live/text_to_say.txt
  # Synthesised audio; same path as T0_ReadAudio.input_wav_file.
  output_wav_file: data/live/audio_generated.wav
  # Available voices: eliot_christon, example_reference, jean_pierre_pernaut,
  # perrine_laffont, ptisham, roberto_caurand, teddy_riner, pierre_faury,
  # eugenie_declaron, thomas_oxisoglou
  # NOTE(review): the key says "wav" but the file is an .mp3 - confirm the tool accepts it.
  speaker_wav_file: data/stored/assistant/voices/pierre_faury.mp3

# Tool 4: Action Selection
T4_ActionSelection:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  python_version: "3.11"
  port: 5004
  # Input text; same path as T3_TTS.input_text_file.
  input_text_file: data/live/text_to_say.txt
  # Text file the selected action is written to.
  output_text_file: data/live/action_selected.txt
  # Classifier model folder and the CSV dataset (presumably used for training - TODO confirm).
  pretrained_model_folder: data/pretrained_models/action_classifier/
  action_selection_dataset_path: data/stored/assistant/action_selection_training/action_dataset.csv

# Tool 5: Perform Action
# T5_PerformAction: {
#   python_version: "3.11",
#   port: 5005,
# }

# Tool 9: LEDs
T9_LEDS:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  python_version: "2.7"
  port: 5009
  # Text file the tool reads; presumably holds an RGB colour - TODO confirm the format.
  input_text_file: data/live/led_rgb.txt

# T10_RetrieveAndAugment: {
#   python_version: "3.11",
#   port: 5010,
#   input_text_file: data/live/text_transcribed.txt,
#   input_database_folder: data/stored/documents/,
#   output_text_file: data/live/documents_context.txt,
#   output_csv_file: data/live/documents_context.csv,
#   load_directory: data/pretrained_models/vectordb/,
#   input_additional_files: [],
#   number_of_results: 3,
#   chunk_size: 4000,
# }

#---------------------------------------#
#       INFINITE TOOLS PARAMETERS       #
#---------------------------------------#

# Tool 6: Record Audio (from NAO)
T6_RecordAudio:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  python_version: "2.7"
  port: 5006
  # NOTE(review): unclear from here whether this is a channel index or a
  # channel count - confirm against the recording tool.
  channel: 2
  # Recorded audio; same path as T1_PersonRecognition.input_audio_file and
  # T8_STT.input_wav_file.
  output_wav_file: data/live/audio_recorded.wav
  # File written when speech is detected (presumably a timestamp - TODO confirm).
  output_speech_detected_file: data/live/time_speech_detected.txt
  # Presumably the length of audio retained, in seconds.
  seconds_to_keep: 15
  # Presumably in Hz - TODO confirm.
  sample_rate: 48000
  # NOTE(review): scale of this threshold is not visible here - confirm.
  loudness_threshold: 0.65

# Tool 7: Capture Images (from NAO)
# T7_CaptureImages: {
#   python_version: "2.7",
#   port: 5007,
#   output_image_folder: data/live/images/,
#   number_of_images: 10,
#   interval: 1,
#   video_device_args: {resolution: 2, color_space: 11, fps: 30},
#   face_detection_time_ms: 500,
#   image_args: {width: 248, height: 248},
# }

# Tool 8: Speech to Text
T8_STT:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  python_version: "3.11"
  port: 5008
  # Size of the speech-to-text model to load.
  model_size: medium
  # Audio to transcribe; same path as T6_RecordAudio.output_wav_file.
  input_wav_file: data/live/audio_recorded.wav
  # Text file the transcription is written to.
  output_text_file: data/live/text_transcribed.txt

# Tool 11: Gesture (for NAO)
T11_Gesture:
  # Runtime settings (presumably interpreter version and service port - confirm with the launcher).
  # No file inputs or outputs are configured for this tool.
  python_version: "2.7"
  port: 5011