forked from yurimarcon/dubbing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvoice_generator.py
74 lines (63 loc) · 2.41 KB
/
voice_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import whisper
import torch
import sys
import json
import os
import glob
from TTS.api import TTS
from pydub import AudioSegment
from utils_audio import (
get_silence_ranges,
get_initial_silence_duration,
get_speed_factory,
remove_silence_unecessery,
put_silence_to_ajust_time,
ajust_time_segments,
adjust_segment_speed
)
from utils_translate import translate_text
from utils_voice_generator import combine_adjusted_segments, get_files_path
from config import VOICE_MODEL, PATH_RELATIVE, FILE_NAME_SEGMENT, FILE_NAME_ADJUSTED_TEMP, FILE_NAME_ADJUSTED, ORIGINAL_AUDIO, OUTPUT_AUDIO
# Command line arguments
# Usage: python voice_generator.py <input_audio> <transcript_json> <output_audio> <source_lang> <dest_lang>
input_audio = sys.argv[1]  # reference speaker audio, also passed as the cloning voice to the TTS model
input_transcript_text = sys.argv[2]  # JSON file with a top-level 'segments' list (Whisper-style transcript)
output_audio = sys.argv[3]  # NOTE(review): assigned but never used below — main() writes OUTPUT_AUDIO from config; confirm which is intended
source_lang = sys.argv[4]  # language code of the original speech
dest_language = sys.argv[5]  # target language code for synthesis/translation
# Initialize TTS model
# Coqui XTTS v2 multilingual voice-cloning model; uses the GPU when CUDA is available.
# Loaded at import time so both generate_audio_segments' default and main() share one instance.
tts_model = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=torch.cuda.is_available()
)
def load_transcript(file_path):
    """Load a Whisper-style transcript JSON and return its 'segments' list.

    Args:
        file_path: Path to a JSON file containing a top-level 'segments' key.

    Returns:
        The list stored under 'segments'.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON has no 'segments' key.
    """
    # Explicit encoding: the platform default may not be UTF-8, which would
    # corrupt non-ASCII transcript text on some systems (e.g. Windows cp1252).
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)['segments']
def generate_audio_segments(segments, speaker_wav, dest_folder, tts_model=tts_model):
    """Synthesize one wav file per transcript segment via voice cloning.

    Each segment's text is translated to the destination language first
    (skipped when the destination is English), then rendered with the
    speaker's cloned voice into ``<dest_folder>/<FILE_NAME_SEGMENT><i>.wav``.
    Progress is printed after every segment.
    """
    total = len(segments)
    for i, seg in enumerate(segments):
        if dest_language != "en":
            spoken_text = translate_text(seg['text'], source_lang, dest_language)
        else:
            spoken_text = seg['text']
        out_path = os.path.join(dest_folder, f"{FILE_NAME_SEGMENT}{i}.wav")
        tts_model.tts_to_file(
            text=spoken_text,
            speaker_wav=speaker_wav,
            language=dest_language,
            file_path=out_path
        )
        print(f"{i + 1}/{total} segments created.")
def clean_up_files(pattern):
    """Delete every file whose path matches the given glob *pattern*."""
    for path in glob.glob(pattern):
        os.remove(path)
def main():
    """Run the dubbing pipeline: synthesize segments, adjust their timing,
    combine them into the final track, and remove intermediate files."""
    segments = load_transcript(input_transcript_text)
    # Render one wav per segment into PATH_RELATIVE, cloning the input voice.
    generate_audio_segments(segments, input_audio, PATH_RELATIVE, tts_model)
    for idx, segment in enumerate(segments):
        initial_file, adjusted_file, final_chunk_file = get_files_path(idx)
        # Per-segment post-processing chain from utils_audio; the exact
        # semantics live there. NOTE(review): names suggest a speed change,
        # silence trim, silence padding, then a second speed pass so each
        # chunk matches the original segment's timing — confirm in utils_audio.
        adjust_segment_speed(segment, initial_file, adjusted_file)
        remove_silence_unecessery(adjusted_file)
        put_silence_to_ajust_time(segment, adjusted_file)
        adjust_segment_speed(segment, adjusted_file, final_chunk_file)
    # Stitch the adjusted chunks into OUTPUT_AUDIO, then align it against the
    # original audio (helpers defined in utils_voice_generator / utils_audio).
    combine_adjusted_segments(segments, input_audio, PATH_RELATIVE, OUTPUT_AUDIO)
    ajust_time_segments(ORIGINAL_AUDIO, OUTPUT_AUDIO)
    # Remove the intermediate per-segment wav files created above.
    clean_up_files(os.path.join(PATH_RELATIVE, "segment*"))


if __name__ == "__main__":
    main()