def GET(self):
    """Serve the file named by the ``file`` query parameter, or a status string.

    Returns:
        The raw bytes of the requested file when ``file`` is supplied and the
        path exists; otherwise the fixed "server is running" string.

    Raises:
        web.notfound: ``file`` was supplied but the path does not exist.
    """
    params = web.input(file="")
    # Bind the name used by every lookup below; leaving it unbound makes any
    # request that carries ?file=... fail with NameError.
    file_path = params.file
    if file_path:
        # NOTE(review): the path comes straight from the query string, so any
        # file readable by this process can be fetched.
        # TODO: only allow access to files under the tmp directory
        # raise web.forbidden()
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                return f.read()
        else:
            logger.error(f"[gewechat] File not found: {file_path}")
            raise web.notfound()
    return "gewechat callback server is running"
def any_to_mp3(any_path, mp3_path):
    """
    Convert an audio file of any supported format to mp3.

    Args:
        any_path: path of the input file
        mp3_path: path of the mp3 file to write

    Raises:
        Exception: any failure is logged and then re-raised unchanged.
    """
    try:
        # Already mp3: a plain copy is enough.
        if any_path.endswith(".mp3"):
            shutil.copy2(any_path, mp3_path)
            return

        # silk input: decode to raw PCM with pilk, then let pydub write the mp3.
        if any_path.endswith((".sil", ".silk", ".slk")):
            pcm_path = any_path + '.pcm'
            try:
                pilk.decode(any_path, pcm_path)

                # TODO: the parameters below may need tuning
                audio = AudioSegment.from_raw(pcm_path, format="raw",
                                              frame_rate=24000,
                                              channels=1,
                                              sample_width=2)  # 16-bit PCM = 2 bytes
                audio.export(mp3_path, format="mp3")
            finally:
                # Remove the temporary PCM file even when decoding or export
                # fails; the guard keeps a missing file from masking the
                # original error.
                if os.path.exists(pcm_path):
                    os.remove(pcm_path)
            return

        # Every other format goes straight through pydub.
        audio = AudioSegment.from_file(any_path)
        audio.export(mp3_path, format="mp3")

    except Exception as e:
        logger.error(f"转换文件到mp3失败: {str(e)}")
        raise


def mp3_to_silk(mp3_path: str, silk_path: str) -> str:
    """Convert MP3 file to SILK format

    Args:
        mp3_path: Path to input MP3 file
        silk_path: Path to output SILK file

    Returns:
        Path to output SILK file
    """
    # Load the MP3, then force mono at 24000 Hz to match the pcm_rate handed
    # to the encoder below.
    # TODO: the parameters below may need tuning
    audio = AudioSegment.from_file(mp3_path)
    audio = audio.set_channels(1)
    audio = audio.set_frame_rate(24000)

    # The intermediate raw PCM file sits next to the input, same stem.
    pcm_path = os.path.splitext(mp3_path)[0] + '.pcm'
    try:
        audio.export(pcm_path, format='s16le')

        # NOTE(review): tencent=True presumably selects the WeChat-compatible
        # silk variant - confirm against the pilk documentation.
        pilk.encode(pcm_path, silk_path, pcm_rate=24000, tencent=True)
    finally:
        # Remove the temporary PCM file on failure as well as on success.
        if os.path.exists(pcm_path):
            os.remove(pcm_path)
    return silk_path