From 0644bfa5a3f9fcf8b4e9c818e3e95ec842fb6c00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Cweifeng2333=E2=80=9D?= <“2715673327@qq.com”>
Date: Wed, 13 Nov 2024 00:24:38 +0800
Subject: [PATCH] update readme
---
README.md | 6 +--
app/core/alingner_wag1.py | 110 --------------------------------------
2 files changed, 3 insertions(+), 113 deletions(-)
delete mode 100644 app/core/alingner_wag1.py
diff --git a/README.md b/README.md
index 22d0edd..2df8ee6 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
+
卡卡字幕助手
VideoCaptioner
一款基于大语言模型(LLM)的智能视频字幕处理助手,支持字幕生成、断句、优化、翻译全流程处理
-
- 简体中文 / [English](./docs/README_EN.md)
+
简体中文 / [English](./docs/README_EN.md)
## 📖 项目介绍
-VideoCaptioner 是一款功能强大的视频字幕配制软件。操作简单且无需高配置,利用大语言模型进行字幕智能断句、校正、优化、翻译,一键为视频配上效果惊艳的字幕。
+卡卡字幕助手(VideoCaptioner)是一款功能强大的视频字幕配制软件。操作简单且无需高配置,利用大语言模型进行字幕智能断句、校正、优化、翻译,一键为视频配上效果惊艳的字幕。
- 🎯 无需GPU即可使用强大的语音识别引擎,自动生成精准字幕
- ✂️ 基于 LLM 的智能分割与断句,字幕阅读更自然流畅
diff --git a/app/core/alingner_wag1.py b/app/core/alingner_wag1.py
deleted file mode 100644
index 041f0fa..0000000
--- a/app/core/alingner_wag1.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from rapidfuzz import fuzz
-
-
-def align_texts_sequence_alignment(text1, text2, gap_penalty=100):
- """
- 使用序列比对算法(如 Needleman-Wunsch)对两个文本列表进行对齐,
- 不改变原始列表的内容,只是在对齐过程中可能插入空字符串。
-
- Args:
- text1 (list): 源文本列表,保持不变。
- text2 (list): 目标文本列表,保持不变。
- gap_penalty (int): 插入或删除操作的固定惩罚分数。
-
- Returns:
- tuple: (aligned_source, aligned_target)
- - aligned_source (list): 对齐后的源文本列表。
- - aligned_target (list): 对齐后的目标文本列表。
- """
- n = len(text1)
- m = len(text2)
-
- # 初始化得分矩阵和方向矩阵
- score = [[0] * (m + 1) for _ in range(n + 1)]
- direction = [[None] * (m + 1) for _ in range(n + 1)]
-
- # 初始化第一列和第一行
- for i in range(1, n + 1):
- score[i][0] = score[i - 1][0] - gap_penalty
- direction[i][0] = 'up' # 从上方来,表示插入空的目标句子
- for j in range(1, m + 1):
- score[0][j] = score[0][j - 1] - gap_penalty
- direction[0][j] = 'left' # 从左方来,表示插入空的源句子
-
- # 填充得分矩阵和方向矩阵
- for i in range(1, n + 1):
- for j in range(1, m + 1):
- # 计算匹配得分
- sim = fuzz.ratio(text1[i - 1], text2[j - 1])
- match_score = score[i - 1][j - 1] + sim # 相似度越高,得分越高
-
- # 计算插入和删除得分
- delete_score = score[i - 1][j] - gap_penalty # 删除 text1 中的句子
- insert_score = score[i][j - 1] - gap_penalty # 插入 text2 中的句子
-
- # 选择得分最高的操作
- max_score = max(match_score, delete_score, insert_score)
- score[i][j] = max_score
-
- # 记录方向
- if max_score == match_score:
- direction[i][j] = 'diag' # 来自左上方,表示匹配或替换
- elif max_score == delete_score:
- direction[i][j] = 'up' # 来自上方,表示删除(text2 中插入空行)
- else:
- direction[i][j] = 'left' # 来自左方,表示插入(text1 中插入空行)
-
- # 回溯得到对齐结果
- aligned_source = []
- aligned_target = []
- i, j = n, m
- while i > 0 or j > 0:
- dir = direction[i][j]
- if dir == 'diag':
- aligned_source.insert(0, text1[i - 1])
- aligned_target.insert(0, text2[j - 1])
- i -= 1
- j -= 1
- elif dir == 'up':
- aligned_source.insert(0, text1[i - 1])
- aligned_target.insert(0, '') # 插入空的目标句子
- i -= 1
- elif dir == 'left':
- aligned_source.insert(0, '') # 插入空的源句子
- aligned_target.insert(0, text2[j - 1])
- j -= 1
- else:
- break # 回溯结束
-
- return aligned_source, aligned_target
-
-
-if __name__ == '__main__':
- text1 = ['yep human hair be about that thick', "yep and that's a really really tiny LED",
- 'uvleds could be used to sterilize surfaces', 'like in hospitals or kitchens', 'just flick on the UV',
- 'lights and pathogens would be dead in seconds', 'copy 19 or you know',
- "UV LED companies stop pressing like it's kind of better because",
- "everything's very good in these UV LEDs", 'you can start at all the covid 19',
- 'for anything there we use aluminium gardenizer them', 'for UB we use aluminium gardenizer',
- 'okay the Bam Jap is much bigger', "do you think this is what's coming", "it's okay to work",
- 'but the problem the cost costs are too high changes', 'this is not thin passing', 'the cost is very high',
- 'okay if the infinishing program', 'on a shifty pass closely is almost comparable']
- text2 = ['Yep, human hair is about that thick',
- "Yep, and that's a really tiny LEDUV LEDs could be used to sterilize surface",
- 'Like in hospitals or kitchens', 'Just flick on the UV lights and pathogens would be dead in seconds',
- 'COVID-19 or you know', "UV LED companies are improving, it's kind of better because",
- "everything's very good in these UV LEDs", 'You can start with all the COVID-19 precautions',
- 'For everything, we use aluminum ganizers', 'For UV, we use aluminum ganizers',
- 'Okay, the bigger one is much better', "Do you think this is what's coming?", "It's okay to work",
- 'But the problem is the costs are too high', 'This is not a thin pass', 'The costs are very high',
- 'Okay, if the finishing program', 'on a shifty pass closely is almost comparable']
-
- aligned_source, aligned_target = align_texts_sequence_alignment(text1, text2)
-
- for idx, (s, t) in enumerate(zip(aligned_source, aligned_target)):
- print(f"行 {idx + 1}:")
- print(f"文本1: {s}")
- print(f"文本2: {t}")
- sim = fuzz.ratio(s, t) if s and t else 0
- print(f"相似度: {sim:.2f}")
- print('----')