feat: support gewechat voice (#171)

hanfangyuan4396 · Dec 28, 2024 · 084bf0f · 084bf0f
1 parent 84ccb68
commit 084bf0f
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 12 deletions.
diff --git a/channel/gewechat/gewechat_channel.py b/channel/gewechat/gewechat_channel.py
@@ -13,6 +13,7 @@
 from common.utils import compress_imgfile, fsize
 from config import conf, save_config
 from lib.gewechat import GewechatClient
+from voice.audio_convert import mp3_to_silk
 
 MAX_UTF8_LEN = 2048
 
@@ -113,8 +114,19 @@ def send(self, reply: Reply, context: Context):
             logger.info("[gewechat] Do send text to {}: {}".format(receiver, reply_text))
         elif reply.type == ReplyType.VOICE:
             try:
-                # TODO: mp3 to silk
                 content = reply.content
+                if content.endswith('.mp3'):
+                    # 如果是mp3文件，转换为silk格式
+                    silk_path = content + '.silk'
+                    silk_path = mp3_to_silk(content, silk_path)
+                    callback_url = conf().get("gewechat_callback_url")
+                    silk_url = callback_url + "?file=" + silk_path
+                    # TODO: 计算silk文件语音时长，暂时都设置为5s
+                    self.client.post_voice(self.app_id, receiver, silk_url, 5000)
+                    logger.info("[gewechat] Do send voice to {}: {}".format(receiver, silk_url))
+                    return
+                else:
+                    logger.error(f"[gewechat] voice file is not mp3, path: {content}, only support mp3")
             except Exception as e:
                 logger.error(f"[gewechat] send voice failed: {e}")
         elif reply.type == ReplyType.IMAGE_URL:
@@ -136,10 +148,13 @@ class Query:
     def GET(self):
         params = web.input(file="")
         if params.file:
-            if os.path.exists(params.file):
-                with open(params.file, 'rb') as f:
+            # TODO(security): only serve files under the tmp directory; anything else should raise web.forbidden()
+            file_path = params.file  # bind the name used below; it was previously never assigned (NameError)
+            if os.path.exists(file_path):
+                with open(file_path, 'rb') as f:
                     return f.read()
             else:
+                logger.error(f"[gewechat] File not found: {file_path}")
                 raise web.notfound()
         return "gewechat callback server is running"
 

diff --git a/requirements-optional.txt b/requirements-optional.txt
@@ -11,6 +11,7 @@ edge-tts # edge-tts
 numpy<=1.24.2
 langid # language detect
 elevenlabs==1.0.3 # elevenlabs TTS
+pilk==0.2.4 # process wechat silk file
 
 #install plugin
 dulwich

diff --git a/voice/audio_convert.py b/voice/audio_convert.py
@@ -1,3 +1,4 @@
+import os
 import shutil
 import wave
 
@@ -10,6 +11,11 @@
 
 from pydub import AudioSegment
 
+try:
+    import pilk
+except ImportError:
+    logger.warning("import pilk failed, silk voice conversion will not be supported. Try: pip install pilk")
+
 sil_supports = [8000, 12000, 16000, 24000, 32000, 44100, 48000]  # slk转wav时，支持的采样率
 
 
@@ -43,15 +49,43 @@ def get_pcm_from_wav(wav_path):
 def any_to_mp3(any_path, mp3_path):
     """
-    把任意格式转成mp3文件
-    """
-    if any_path.endswith(".mp3"):
-        shutil.copy2(any_path, mp3_path)
-        return
-    if any_path.endswith(".sil") or any_path.endswith(".silk") or any_path.endswith(".slk"):
-        sil_to_wav(any_path, any_path)
-        any_path = mp3_path
-    audio = AudioSegment.from_file(any_path)
-    audio.export(mp3_path, format="mp3")
+    Convert an audio file (mp3, silk, or anything pydub can read) to an mp3 file.
+
+    Args:
+        any_path: Path of the input audio file.
+        mp3_path: Path of the mp3 file to write.
+
+    Raises:
+        Exception: Any conversion error is logged and then re-raised.
+    """
+    try:
+        # Already mp3: a plain copy is enough.
+        if any_path.endswith(".mp3"):
+            shutil.copy2(any_path, mp3_path)
+            return
+
+        # silk input: decode to a temporary raw PCM file with pilk, then encode it with pydub.
+        if any_path.endswith((".sil", ".silk", ".slk")):
+            pcm_path = any_path + '.pcm'
+            try:
+                pilk.decode(any_path, pcm_path)
+                # TODO: the parameters below may need tuning
+                audio = AudioSegment.from_raw(pcm_path, format="raw",
+                                              frame_rate=24000,
+                                              channels=1,
+                                              sample_width=2)  # 16-bit PCM = 2 bytes
+                audio.export(mp3_path, format="mp3")
+            finally:
+                # Remove the temporary PCM file even when decoding or export fails.
+                if os.path.exists(pcm_path):
+                    os.remove(pcm_path)
+            return
+
+        # Every other format goes through pydub directly.
+        audio = AudioSegment.from_file(any_path)
+        audio.export(mp3_path, format="mp3")
+    except Exception as e:
+        logger.error(f"转换文件到mp3失败: {str(e)}")
+        raise
 
 
 def any_to_wav(any_path, wav_path):
@@ -87,6 +121,32 @@ def any_to_sil(any_path, sil_path):
         f.write(silk_data)
     return audio.duration_seconds * 1000
 
+def mp3_to_silk(mp3_path: str, silk_path: str) -> str:
+    """Convert an MP3 file to a SILK file (encoded with pilk's ``tencent=True`` flag).
+
+    Args:
+        mp3_path: Path to the input MP3 file.
+        silk_path: Path of the SILK file to write.
+
+    Returns:
+        ``silk_path``, unchanged.
+    """
+    # Mono at 24000 Hz, kept equal to the pcm_rate passed to pilk.encode below.
+    # TODO: these parameters may need tuning
+    audio = AudioSegment.from_file(mp3_path)
+    audio = audio.set_channels(1).set_frame_rate(24000)
+
+    # pilk encodes from a PCM file, so stage a temporary one next to the MP3.
+    pcm_path = os.path.splitext(mp3_path)[0] + '.pcm'
+    try:
+        audio.export(pcm_path, format='s16le')
+        pilk.encode(pcm_path, silk_path, pcm_rate=24000, tencent=True)
+    finally:
+        # Remove the temporary PCM file even when export or encoding fails.
+        if os.path.exists(pcm_path):
+            os.remove(pcm_path)
+
+    return silk_path
 
 def any_to_amr(any_path, amr_path):
     """