Skip to content

Commit 17118d9

Browse files
liangweifeng authored and WEIFENG2333 committed
refactor: 统一音频输入参数命名并优化相关类
- 将多个类中的 `audio_path` 参数重命名为 `audio_input`,以提高一致性和可读性。
- 更新相关文档和注释,确保参数说明与实际代码一致。
- 优化音频处理逻辑,确保在不同类中对音频输入的处理方式一致。

这些改动增强了代码的可维护性,减少了潜在的混淆。
1 parent 153cd14 commit 17118d9

17 files changed: +387 -123 lines changed

app/components/WhisperCppSettingWidget.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -527,9 +527,7 @@ def setup_ui(self):
527527
self.container.setStyleSheet("QWidget{background: transparent}")
528528
self.containerLayout = QVBoxLayout(self.container)
529529

530-
self.setting_group = SettingCardGroup(
531-
self.tr("Whisper CPP 设置(不稳定 🤔)"), self
532-
)
530+
self.setting_group = SettingCardGroup(self.tr("Whisper CPP 设置"), self)
533531

534532
# 模型选择
535533
self.model_card = ComboBoxSettingCard(

app/core/asr/base.py

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -36,18 +36,18 @@ class BaseASR:
3636

3737
def __init__(
3838
self,
39-
audio_path: Optional[Union[str, bytes]] = None,
39+
audio_input: Optional[Union[str, bytes]] = None,
4040
use_cache: bool = False,
4141
need_word_time_stamp: bool = False,
4242
):
4343
"""Initialize ASR with audio data.
4444
4545
Args:
46-
audio_path: Path to audio file or raw audio bytes
46+
audio_input: Path to audio file or raw audio bytes
4747
use_cache: Whether to cache recognition results
4848
need_word_time_stamp: Whether to return word-level timestamps
4949
"""
50-
self.audio_path = audio_path
50+
self.audio_input = audio_input
5151
self.file_binary = None
5252
self.use_cache = use_cache
5353
self._set_data()
@@ -56,25 +56,27 @@ def __init__(
5656

5757
def _set_data(self):
5858
"""Load audio data and compute CRC32 hash for cache key."""
59-
if isinstance(self.audio_path, bytes):
60-
self.file_binary = self.audio_path
61-
elif isinstance(self.audio_path, str):
62-
ext = self.audio_path.split(".")[-1].lower()
59+
if isinstance(self.audio_input, bytes):
60+
self.file_binary = self.audio_input
61+
elif isinstance(self.audio_input, str):
62+
ext = self.audio_input.split(".")[-1].lower()
6363
assert (
6464
ext in self.SUPPORTED_SOUND_FORMAT
6565
), f"Unsupported sound format: {ext}"
66-
assert os.path.exists(self.audio_path), f"File not found: {self.audio_path}"
67-
with open(self.audio_path, "rb") as f:
66+
assert os.path.exists(
67+
self.audio_input
68+
), f"File not found: {self.audio_input}"
69+
with open(self.audio_input, "rb") as f:
6870
self.file_binary = f.read()
6971
else:
70-
raise ValueError("audio_path must be provided as string or bytes")
72+
raise ValueError("audio_input must be provided as string or bytes")
7173
crc32_value = zlib.crc32(self.file_binary) & 0xFFFFFFFF
7274
self.crc32_hex = format(crc32_value, "08x")
7375

7476
def _get_audio_duration(self) -> float:
7577
"""Get audio duration in seconds using pydub."""
7678
if not self.file_binary:
77-
return 0.0
79+
return 0.01
7880
try:
7981
audio = AudioSegment.from_file(BytesIO(self.file_binary))
8082
return audio.duration_seconds

app/core/asr/bcut.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,11 +30,11 @@ class BcutASR(BaseASR):
3030

3131
def __init__(
3232
self,
33-
audio_path: Union[str, bytes],
33+
audio_input: Union[str, bytes],
3434
use_cache: bool = True,
3535
need_word_time_stamp: bool = False,
3636
):
37-
super().__init__(audio_path, use_cache=use_cache)
37+
super().__init__(audio_input, use_cache=use_cache)
3838
self.session = requests.Session()
3939
self.task_id: Optional[str] = None
4040
self.__etags: List[str] = []

app/core/asr/faster_whisper.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import subprocess
66
import tempfile
77
from pathlib import Path
8-
from typing import Any, Callable, List, Optional
8+
from typing import Any, Callable, List, Optional, Union
99

1010
import GPUtil
1111

@@ -27,7 +27,7 @@ class FasterWhisperASR(BaseASR):
2727

2828
def __init__(
2929
self,
30-
audio_path: str,
30+
audio_input: Union[str, bytes],
3131
faster_whisper_program: str,
3232
whisper_model: str,
3333
model_dir: str,
@@ -52,7 +52,7 @@ def __init__(
5252
max_comma_cent: int = 50,
5353
prompt: Optional[str] = None,
5454
):
55-
super().__init__(audio_path, use_cache)
55+
super().__init__(audio_input, use_cache)
5656

5757
# 基本参数
5858
self.model_path = whisper_model
@@ -112,7 +112,7 @@ def __init__(
112112
)
113113
self.faster_whisper_program = "faster-whisper-xxl"
114114

115-
def _build_command(self, audio_path: str) -> List[str]:
115+
def _build_command(self, audio_input: str) -> List[str]:
116116
"""Build command line arguments for faster-whisper."""
117117

118118
cmd = [
@@ -129,7 +129,7 @@ def _build_command(self, audio_path: str) -> List[str]:
129129

130130
cmd.extend(
131131
[
132-
str(audio_path),
132+
str(audio_input),
133133
"-l",
134134
self.language,
135135
"-d",
@@ -204,7 +204,7 @@ def _build_command(self, audio_path: str) -> List[str]:
204204

205205
def _make_segments(self, resp_data: str) -> List[ASRDataSeg]:
206206
asr_data = ASRData.from_srt(resp_data)
207-
207+
208208
# 幻觉文本关键词列表
209209
hallucination_keywords = [
210210
"请不吝点赞 订阅 转发",
@@ -214,17 +214,17 @@ def _make_segments(self, resp_data: str) -> List[ASRDataSeg]:
214214
filtered_segments = []
215215
for seg in asr_data.segments:
216216
text = seg.text.strip()
217-
217+
218218
# 跳过音乐标记
219219
if text.startswith(("【", "[", "(", "(")):
220220
continue
221-
221+
222222
# 跳过包含幻觉关键词的文本
223223
if any(keyword in text for keyword in hallucination_keywords):
224224
continue
225-
225+
226226
filtered_segments.append(seg)
227-
227+
228228
return filtered_segments
229229

230230
def _run(
@@ -241,8 +241,8 @@ def _default_callback(x, y):
241241
wav_path = temp_dir / "audio.wav"
242242
output_path = wav_path.with_suffix(".srt")
243243

244-
if isinstance(self.audio_path, str):
245-
shutil.copy2(self.audio_path, wav_path)
244+
if isinstance(self.audio_input, str):
245+
shutil.copy2(self.audio_input, wav_path)
246246
else:
247247
if self.file_binary:
248248
wav_path.write_bytes(self.file_binary)

app/core/asr/jianying.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -24,14 +24,14 @@ class JianYingASR(BaseASR):
2424

2525
def __init__(
2626
self,
27-
audio_path: Union[str, bytes],
27+
audio_input: Union[str, bytes],
2828
use_cache: bool = False,
2929
need_word_time_stamp: bool = False,
3030
start_time: float = 0,
3131
end_time: float = 6000,
3232
):
33-
super().__init__(audio_path, use_cache)
34-
self.audio_path = audio_path
33+
super().__init__(audio_input, use_cache)
34+
self.audio_input = audio_input
3535
self.end_time = end_time
3636
self.start_time = start_time
3737

@@ -221,10 +221,10 @@ def _upload_sign(self):
221221

222222
def _upload_auth(self):
223223
"""Get upload authorization"""
224-
if isinstance(self.audio_path, bytes):
225-
file_size = len(self.audio_path)
224+
if isinstance(self.audio_input, bytes):
225+
file_size = len(self.audio_input)
226226
else:
227-
file_size = os.path.getsize(self.audio_path)
227+
file_size = os.path.getsize(self.audio_input)
228228
request_parameters = f"Action=ApplyUploadInner&FileSize={file_size}&FileType=object&IsInner=1&SpaceName=lv-mac-recognition&Version=2020-11-19&s=5y0udbjapi"
229229

230230
t = datetime.datetime.utcnow()

app/core/asr/whisper_api.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Callable, List, Optional
1+
from typing import Any, Callable, List, Optional, Union
22

33
from openai import OpenAI
44

@@ -19,7 +19,7 @@ class WhisperAPI(BaseASR):
1919

2020
def __init__(
2121
self,
22-
audio_path: str,
22+
audio_input: Union[str, bytes],
2323
whisper_model: str,
2424
need_word_time_stamp: bool = False,
2525
language: str = "zh",
@@ -31,7 +31,7 @@ def __init__(
3131
"""Initialize Whisper API.
3232
3333
Args:
34-
audio_path: Path to audio file
34+
audio_input: Path to audio file or raw audio bytes
3535
whisper_model: Model name
3636
need_word_time_stamp: Return word-level timestamps
3737
language: Language code (default: zh)
@@ -40,7 +40,7 @@ def __init__(
4040
api_key: API key
4141
use_cache: Enable caching
4242
"""
43-
super().__init__(audio_path, use_cache)
43+
super().__init__(audio_input, use_cache)
4444

4545
self.base_url = normalize_base_url(base_url)
4646
self.api_key = api_key.strip()

app/core/asr/whisper_cpp.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import tempfile
66
import time
77
from pathlib import Path
8-
from typing import Any, Callable, List, Optional
8+
from typing import Any, Callable, List, Optional, Union
99

1010
from ...config import MODEL_PATH
1111
from ..utils.logger import setup_logger
@@ -25,16 +25,20 @@ class WhisperCppASR(BaseASR):
2525

2626
def __init__(
2727
self,
28-
audio_path,
28+
audio_input: Union[str, bytes],
2929
language="en",
3030
whisper_cpp_path=None,
3131
whisper_model=None,
3232
use_cache: bool = False,
3333
need_word_time_stamp: bool = False,
3434
):
35-
super().__init__(audio_path, use_cache)
36-
assert os.path.exists(audio_path), f"Audio file not found: {audio_path}"
37-
assert audio_path.endswith(".wav"), f"Audio must be WAV format: {audio_path}"
35+
super().__init__(audio_input, use_cache)
36+
37+
if isinstance(audio_input, str):
38+
assert os.path.exists(audio_input), f"Audio file not found: {audio_input}"
39+
assert audio_input.endswith(
40+
".wav"
41+
), f"Audio must be WAV format: {audio_input}"
3842

3943
# Auto-detect whisper executable if not provided
4044
if whisper_cpp_path is None:
@@ -116,13 +120,13 @@ def _default_callback(_progress: int, _message: str) -> None:
116120

117121
with tempfile.TemporaryDirectory() as temp_path:
118122
temp_dir = Path(temp_path)
119-
wav_path = temp_dir / "audio.wav"
123+
wav_path = temp_dir / "whisper_cpp_audio.wav"
120124
output_path = wav_path.with_suffix(".srt")
121125

122126
try:
123127
# 复制音频文件
124-
if isinstance(self.audio_path, str):
125-
shutil.copy2(self.audio_path, wav_path)
128+
if isinstance(self.audio_input, str):
129+
shutil.copy2(self.audio_input, wav_path)
126130
else:
127131
if self.file_binary:
128132
wav_path.write_bytes(self.file_binary)
@@ -136,10 +140,7 @@ def _default_callback(_progress: int, _message: str) -> None:
136140
logger.info("Whisper.cpp command: %s", " ".join(whisper_params))
137141

138142
# Get audio duration
139-
if isinstance(self.audio_path, str):
140-
total_duration = self.get_audio_duration(self.audio_path)
141-
else:
142-
total_duration = 600
143+
total_duration = self.audio_duration
143144
logger.info("Audio duration: %d seconds", total_duration)
144145

145146
# Start process
@@ -272,7 +273,7 @@ def detect_whisper_executable() -> str:
272273
if __name__ == "__main__":
273274
# 简短示例
274275
asr = WhisperCppASR(
275-
audio_path="audio.mp3",
276+
audio_input="audio.mp3",
276277
whisper_model="tiny",
277278
whisper_cpp_path="bin/whisper-cpp.exe",
278279
language="en",

resource/translations/VideoCaptioner_en_US.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2499,7 +2499,7 @@ Note: When using small LLM models, it is recommended to keep the script within 1
24992499
<name>WhisperCppSettingWidget</name>
25002500
<message>
25012501
<location filename="../../app/components/WhisperCppSettingWidget.py" line="530" />
2502-
<source>Whisper CPP 设置(不稳定 🤔)</source>
2502+
<source>Whisper CPP 设置</source>
25032503
<translation>Whisper CPP Settings (unstable 🤔)</translation>
25042504
</message>
25052505
<message>

resource/translations/VideoCaptioner_zh_CN.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2482,7 +2482,7 @@
24822482
<name>WhisperCppSettingWidget</name>
24832483
<message>
24842484
<location filename="../../app/components/WhisperCppSettingWidget.py" line="530"/>
2485-
<source>Whisper CPP 设置(不稳定 🤔)</source>
2485+
<source>Whisper CPP 设置</source>
24862486
<translation type="unfinished"></translation>
24872487
</message>
24882488
<message>

resource/translations/VideoCaptioner_zh_HK.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2503,7 +2503,7 @@
25032503
<name>WhisperCppSettingWidget</name>
25042504
<message>
25052505
<location filename="../../app/components/WhisperCppSettingWidget.py" line="530"/>
2506-
<source>Whisper CPP 设置(不稳定 🤔)</source>
2506+
<source>Whisper CPP 设置</source>
25072507
<translation>Whisper CPP 設置(不穩定 🤔)</translation>
25082508
</message>
25092509
<message>

0 commit comments

Comments
 (0)