diff --git a/.gitignore b/.gitignore index 3966cdf..a41f426 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ old/ *.exe build/ dist/ -*.spec \ No newline at end of file +*.spec +whisper_models/ diff --git a/README.md b/README.md index 3a2b5eb..b45a1ae 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,32 @@ ![image-20230926124922726](md/README/image-20230926124922726.png) +## 自动生成字幕 + +本项目提供自动生成字幕功能,使用openai的[whisper](https://github.com/openai/whisper)项目及其模型在本地进行语音转文字生成字幕。 + +最好使用GPU运行,否则速度较慢。 + +由于涉及到的库较多,打包生成的可执行文件较大,目前暂不发布打包的可执行文件,需要python环境运行,配置python环境见下文依赖部分。 + +运行gen_caption.py为指定视频生成字幕: + +```bash +python gen_caption.py video_path +``` + +或输入数字选择视频: + +```bash +python gen_caption.py +``` + +![2024-04-08_17-42](md/README/2024-04-08_17-42.png) + +等待程序运行完成,生成的字幕文件为`.srt`格式,与视频文件在同级目录下,用支持字幕的播放器打开视频即可看到带字幕的视频。 + +tips: 生成字幕的时间较长,可以先观看视频,字幕生成好了再重新打开视频享受字幕。使用GPU大约需要几分钟,若无GPU则不建议使用本项目提供的字幕功能,可自行寻找其他生成字幕的工具。 + ## 依赖 * ffmpeg,已在Release中提供。若在Linux环境下运行,需手动安装ffmpeg: @@ -35,15 +61,22 @@ sudo apt update sudo apt install ffmpeg ``` -*若想用python环境运行,需安装这些依赖* +*若想用python环境运行,需安装以下依赖* * python,[下载](https://www.python.org/ftp/python/3.9.4/python-3.9.4-amd64.exe)并安装 * python第三方库requests。打开命令行(按win+r,在打开的窗口中输入cmd,回车),运行如下命令安装: - ```bash - pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple - ``` +```bash +pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple +``` + +**若使用GPU运行自动生成字幕功能,需要先安装cuda版本的pytorch,具体安装方法见[pytorch官网](https://pytorch.org/get-started/locally/)** + +安装whisper:(依赖于pytorch,若未安装pytorch,会自动安装,但是cpu版本) +```bash +pip install -r requirements-whisper.txt -i https://pypi.tuna.tsinghua.edu.cn/simple +``` ## 注意 @@ -61,7 +94,5 @@ sudo apt install ffmpeg pip install pyinstaller # 打包 pyinstaller -F main.py +pyinstaller -F gen_caption.py ``` - -## 语音转文字 - diff --git a/gen_caption.py b/gen_caption.py index 31bb6d0..36177c0 100644 --- a/gen_caption.py +++ b/gen_caption.py @@ -1,7 +1,8 @@ import whisper import time from 
zhconv import convert # 简繁体转换 -# from moviepy.editor import VideoFileClip +import sys +import os def seconds_to_hmsm(seconds): @@ -29,30 +30,28 @@ def seconds_to_hmsm(seconds): def main(): # 视频文件路径 - video_path = 'output/深度学习-video/深度学习-任雪梅-第4周 星期三 第4大节.mp4' - # 输出音频文件路径 - # audio_output_path = 'output/audio.mp3' + if len(sys.argv) >= 2: + video_path = sys.argv[1] + else: + files = [] + for dirpath, dirnames, filenames in os.walk('output/'): + for filename in filenames: + if filename.endswith('.mp4'): + files.append(os.path.join(dirpath, filename)) + for i, f in enumerate(files): + print(i, ":", f) + video_path = files[eval(input('select a video file by input a num: '))] - # # 加载视频文件 - # video = VideoFileClip(video_path) - - # # 30秒的音频 - # audio = video.audio.subclip(60, 2*60) - - # # 保存音频文件 - # audio.write_audiofile(audio_output_path) - - # # 释放资源 - # video.close() + audio_path = video_path.replace("mp4", "m4a") + cmd = f"ffmpeg -i '{video_path}' -vn -ar {whisper.audio.SAMPLE_RATE} '{audio_path}'" + os.system(cmd) model = whisper.load_model("base", download_root="whisper_models/") start = time.time() - result = model.transcribe(video_path, verbose=False, language="zh") + result = model.transcribe(audio_path, verbose=False, language="zh") print("Time cost: ", time.time() - start) - for segment in result["segments"]: - print(segment["start"], segment["end"], segment["text"]) # 写入字幕文件 with open(video_path.replace("mp4", "srt"), 'w', encoding='utf-8') as f: i = 1 @@ -63,6 +62,9 @@ def main(): i += 1 f.write(convert(r['text'], 'zh-cn')+'\n') # 结果可能是繁体,转为简体zh-cn f.write('\n') + + # 删除音频文件 + os.remove(audio_path) if __name__ == '__main__': diff --git a/md/README/2024-04-08_17-42.png b/md/README/2024-04-08_17-42.png new file mode 100644 index 0000000..2d4ec4b Binary files /dev/null and b/md/README/2024-04-08_17-42.png differ diff --git a/requirements-whisper.txt b/requirements-whisper.txt new file mode 100644 index 0000000..640ffc4 --- /dev/null +++ 
b/requirements-whisper.txt @@ -0,0 +1,2 @@ +openai-whisper +zhconv diff --git a/requirements.txt b/requirements.txt index e6a0d0e..f229360 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1 @@ requests -openai-whisper -zhconv