{"id":4106,"date":"2024-08-17T02:47:54","date_gmt":"2024-08-16T18:47:54","guid":{"rendered":"https:\/\/www.aisharenet.com\/?p=4106"},"modified":"2025-01-19T21:26:46","modified_gmt":"2025-01-19T13:26:46","slug":"insanely-fast-whisper","status":"publish","type":"post","link":"https:\/\/www.kdjingpai.com\/ja\/insanely-fast-whisper\/","title":{"rendered":"Insanely Fast Whisper\uff1a\u5feb\u901f\u9ad8\u6548\u7684\u8f6c\u5f55\u8bed\u97f3\u4e3a\u6587\u672c\u7684\u5f00\u6e90\u9879\u76ee"},"content":{"rendered":"<p>insanely-fast-whisper\u662f\u4e00\u4e2a\u7ed3\u5408\u4e86OpenAI\u7684Whisper\u6a21\u578b\u548c\u5404\u79cd\u4f18\u5316\u6280\u672f\uff08\u5982Transformers, Optimum, Flash Attention\uff09\u7684\u97f3\u9891\u8f6c\u5f55\u5de5\u5177\uff0c\u63d0\u4f9b\u4e86\u547d\u4ee4\u884c\u754c\u9762\uff08CLI\uff09\uff0c\u65e8\u5728\u5feb\u901f\u9ad8\u6548\u5730\u8f6c\u5f55\u5927\u91cf\u97f3\u9891\u3002\u5b83\u4f7f\u7528Whisper Large v3\u6a21\u578b\uff0c\u80fd\u591f\u5728\u4e0d\u523098\u79d2\u7684\u65f6\u95f4\u5185\u8f6c\u5f55150\u5206\u949f\u7684\u97f3\u9891\u5185\u5bb9\u3002\u7528\u6237\u53ef\u4ee5\u901a\u8fc7GitHub\u4ed3\u5e93\u4e86\u89e3\u66f4\u591a\u8be6\u60c5\u3001\u5b89\u88c5\u6307\u5357\u548c\u4f7f\u7528\u5e2e\u52a9\u3002<\/p>\n<p>&nbsp;<\/p>\n<blockquote><p><strong>\u591a\u53d1\u8a00\u4eba\u8bc6\u522b<\/strong><\/p>\n<p>pyannote.audio\u662f\u4e00\u4e2a\u7528Python\u7f16\u5199\u7684\u7528\u4e8e\u8bf4\u8bdd\u4eba\u5206\u79bb\uff08speaker diarization\uff09\u7684\u5f00\u6e90\u5de5\u5177\u5305\u3002\u57fa\u4e8ePyTorch\u673a\u5668\u5b66\u4e60\u6846\u67b6\uff0c\u5b83\u5177\u6709\u6700\u5148\u8fdb\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u548c\u7ba1\u9053\uff0c\u53ef\u4ee5\u8fdb\u4e00\u6b65\u5bf9\u81ea\u5df1\u7684\u6570\u636e\u8fdb\u884c\u5fae\u8c03\uff0c\u4ee5\u83b7\u5f97\u66f4\u597d\u7684\u6027\u80fd\u3002<\/p>\n<p>faster-whisper + pyannote.audio 
\u5b9e\u73b0\u8bed\u8005\u8bc6\u522b\uff0c\u5b9e\u9645\u4e0a\u53ea\u8981\u5c06\u4e8c\u8005\u7684\u8bc6\u522b\u7ed3\u679c\u8fdb\u884c\u7ed3\u5408\u5373\u53ef<\/p>\n<p>\u5b98\u65b9\u4ed3\u5e93\uff1ahttps:\/\/github.com\/pyannote\/pyannote-audio<\/p><\/blockquote>\n<p>&nbsp;<\/p>\n<h2>\u529f\u80fd\u5217\u8868<\/h2>\n<p>\u4f7f\u7528Whisper Large v3\u6a21\u578b\u8fdb\u884c\u97f3\u9891\u8f6c\u5f55<br \/>\n\u91c7\u7528Transformers, Optimum, Flash Attention\u7b49\u6280\u672f<br \/>\n\u63d0\u4f9bCLI\u754c\u9762<br \/>\n\u652f\u6301\u4e0d\u540c\u7684\u4f18\u5316\u7c7b\u578b\u5e76\u5c55\u793a\u57fa\u51c6\u6d4b\u8bd5<\/p>\n<p>&nbsp;<\/p>\n<h2>\u4f7f\u7528\u5e2e\u52a9<\/h2>\n<p>\u5b89\u88c5: \u5229\u7528pip\u8fdb\u884c\u5b89\u88c5\u548c\u914d\u7f6e<br \/>\n\u4f7f\u7528: \u76f4\u63a5\u901a\u8fc7\u547d\u4ee4\u884c\u4f20\u9012\u53c2\u6570\u5e76\u8fd0\u884c\u8f6c\u5f55\u4efb\u52a1<br \/>\n\u83b7\u53d6\u5e2e\u52a9: \u8bbf\u95eeGitHub\u4ed3\u5e93\u9605\u8bfb\u6587\u6863\u548c\u793e\u533a\u4ea4\u6d41<\/p>\n<p>&nbsp;<\/p>\n<p><strong>https:\/\/github.com\/SYSTRAN\/faster-whisper\u9879\u76ee\u7f16\u5199\u7684<a href=\"https:\/\/colab.research.google.com\/drive\/1ofxa1jqDj45VMslrbUXYxd3it_umXxIV?usp=sharing\">google colab\u4ee3\u7801<\/a><\/strong><\/p>\n<blockquote>\n<div>\n<div># \u5b89\u88c5\u5fc5\u8981\u7684\u5e93<\/div>\n<div>get_ipython().system(&#8216;pip install faster-whisper&#8217;)<\/div>\n<div># \u5bfc\u5165\u5fc5\u8981\u7684\u5e93<\/div>\n<div>from faster_whisper import available_models<\/div>\n<div>import torch<\/div>\n<div>import ipywidgets as widgets<\/div>\n<div>from IPython.display import display, clear_output<\/div>\n<div>import os \u00a0# \u5bfc\u5165\u64cd\u4f5c\u7cfb\u7edf\u5e93\uff0c\u7528\u4e8e\u5904\u7406\u6587\u4ef6\u64cd\u4f5c<\/div>\n<div>import gc \u00a0# \u5bfc\u5165\u5783\u573e\u56de\u6536\u5e93<\/div>\n<div># \u81ea\u52a8\u68c0\u6d4b\u8bbe\u5907\u7c7b\u578b\u5e76\u9009\u62e9GPU\u6216CPU<\/div>\n<div>device = &#8220;cuda&#8221; if torch.cuda.is_available() 
else &#8220;cpu&#8221;<\/div>\n<div>model_size = &#8220;large-v2&#8221; \u00a0# \u9ed8\u8ba4\u9009\u62e9\u6a21\u578b\u5927\u5c0f<\/div>\n<div>compute_type = &#8220;float16&#8221; if device == &#8220;cuda&#8221; else &#8220;float32&#8221; \u00a0# \u5982\u679c\u4f7f\u7528CPU\uff0c\u5219\u5207\u6362\u5230float32<\/div>\n<div># \u83b7\u53d6\u53ef\u7528\u6a21\u578b\u7684\u5217\u8868<\/div>\n<div>models_list = available_models()<\/div>\n<div># \u9ed8\u8ba4\u8bed\u8a00\u5217\u8868<\/div>\n<div>supported_languages = [&#8216;en&#8217;, &#8216;fr&#8217;, &#8216;de&#8217;, &#8216;zh&#8217;, &#8216;&#8230;&#8217;] \u00a0# \u4f7f\u7528\u9ed8\u8ba4\u7684\u8bed\u8a00\u5217\u8868<\/div>\n<div>default_language = &#8216;zh&#8217; if &#8216;zh&#8217; in supported_languages else supported_languages[0] \u00a0# \u5982\u679c\u5217\u8868\u4e2d\u6709&#8217;zh&#8217;\uff0c\u5219\u4f7f\u7528\u5b83\u4f5c\u4e3a\u9ed8\u8ba4\u503c\uff1b\u5426\u5219\u4f7f\u7528\u5217\u8868\u4e2d\u7684\u7b2c\u4e00\u4e2a\u503c<\/div>\n<\/div>\n<p>&nbsp;<\/p>\n<div>\n<div># \u521b\u5efaGUI\u754c\u9762<\/div>\n<div>model_label = widgets.Label(&#8216;\u9009\u62e9\u6a21\u578b:&#8217;)<\/div>\n<div>model_dropdown = widgets.Dropdown(options=models_list, value=model_size)<\/div>\n<div>language_label = widgets.Label(&#8216;\u8bed\u8a00:&#8217;)<\/div>\n<div>language_dropdown = widgets.Dropdown(options=supported_languages, value=default_language)<\/div>\n<div>beam_size_label = widgets.Label(&#8216;Beam\u5927\u5c0f:&#8217;)<\/div>\n<div>beam_size_slider = widgets.IntSlider(value=5, min=1, max=10, step=1)<\/div>\n<div>compute_type_label = widgets.Label(&#8216;\u8ba1\u7b97\u7c7b\u578b:&#8217;)<\/div>\n<div>if device == &#8220;cuda&#8221;:<\/div>\n<div>\u00a0 \u00a0 compute_type_options = [&#8216;float16&#8217;, &#8216;int8&#8217;]<\/div>\n<div>else:<\/div>\n<div>\u00a0 \u00a0 compute_type_options = [&#8216;float32&#8217;] \u00a0# \u5982\u679c\u662fCPU\uff0c\u5219\u9501\u5b9a\u4e3afloat32<\/div>\n<div>compute_type_dropdown = 
widgets.Dropdown(options=compute_type_options, value=compute_type)<\/div>\n<div>mode_label = widgets.Label(&#8216;Format Mode:&#8217;)<\/div>\n<div>mode_dropdown = widgets.Dropdown(options=[&#8216;normal&#8217;, &#8216;timeline&#8217;, &#8216;subtitle&#8217;], value=&#8217;normal&#8217;)<\/div>\n<div>initial_prompt_label = widgets.Label(&#8216;\u521d\u59cb\u63d0\u793a:&#8217;) \u00a0# \u65b0\u589e\u7684\u521d\u59cb\u63d0\u793a\u6807\u7b7e<\/div>\n<div>initial_prompt_text = widgets.Text(value=&#8221;) \u00a0# \u65b0\u589e\u7684\u521d\u59cb\u63d0\u793a\u8f93\u5165\u6846<\/div>\n<div>file_name_text = widgets.Text(description=&#8217;\u6587\u4ef6\u540d:&#8217;, value=&#8217;\/content\/&#8217;) \u00a0# \u5141\u8bb8\u7528\u6237\u8f93\u5165\u6587\u4ef6\u540d<\/div>\n<div>transcribe_button = widgets.Button(description=&#8217;\u8f6c\u8bd1&#8217;)<\/div>\n<div>output_area = widgets.Output()<\/div>\n<\/div>\n<p>&nbsp;<\/p>\n<div>\n<div># \u5b9a\u4e49\u8f6c\u8bd1\u51fd\u6570<\/div>\n<div>def transcribe_audio(b):<\/div>\n<div>\u00a0 \u00a0 with output_area:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 clear_output()<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 print(&#8220;\u5f00\u59cb\u8f6c\u5f55&#8230;&#8221;)<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 from faster_whisper import WhisperModel \u00a0# \u52a8\u6001\u5bfc\u5165WhisperModel\uff1a\u5728\u9700\u8981\u65f6\u5bfc\u5165\u4ee5\u8282\u7701RAM<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 try:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 file_name = file_name_text.value \u00a0# \u4f7f\u7528\u7528\u6237\u8f93\u5165\u7684\u6587\u4ef6\u540d<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 initial_prompt = initial_prompt_text.value \u00a0# \u4f7f\u7528\u7528\u6237\u8f93\u5165\u7684\u521d\u59cb\u63d0\u793a<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 # \u786e\u4fdd\u6587\u4ef6\u5b58\u5728<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 if not os.path.exists(file_name):<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 
\u00a0 \u00a0 \u00a0 \u00a0 print(f&#8221;\u6587\u4ef6 {file_name} \u4e0d\u5b58\u5728\uff0c\u8bf7\u68c0\u67e5\u6587\u4ef6\u540d\u548c\u8def\u5f84\u662f\u5426\u6b63\u786e\u3002&#8221;)<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 return<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 # \u83b7\u53d6\u9009\u53d6\u7684\u6a21\u578b<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 selected_model = model_dropdown.value<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 selected_compute_type = compute_type_dropdown.value<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 selected_language = language_dropdown.value<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 # \u521b\u5efa\u65b0\u7684\u6a21\u578b\u5b9e\u4f8b\u5e76\u505a\u8f6c\u8bd1<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 model = WhisperModel(selected_model, device=device, compute_type=selected_compute_type)<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 try:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 # \u8f6c\u8bd1\u97f3\u9891<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 segments, info = model.transcribe(file_name, beam_size=beam_size_slider.value, language=selected_language, initial_prompt=initial_prompt) \u00a0# \u65b0\u589e\u7684\u521d\u59cb\u63d0\u793a\u53c2\u6570<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 # \u6253\u5370\u7ed3\u679c<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 print(&#8220;Detected language &#8216;%s&#8217; with probability %f&#8221; % (info.language, info.language_probability))<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 for segment in segments:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 if mode_dropdown.value == &#8216;normal&#8217;:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 print(&#8220;%s &#8221; % 
(segment.text))<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 elif mode_dropdown.value == &#8216;timeline&#8217;:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 print(&#8220;[%.2fs -&gt; %.2fs] %s&#8221; % (segment.start, segment.end, segment.text))<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 else: \u00a0# subtitle<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 start_time = &#8220;{:02d}:{:02d}:{:02d},{:03d}&#8221;.format(int(segment.start \/\/ 3600), int((segment.start % 3600) \/\/ 60), int(segment.start % 60), int((segment.start % 1) * 1000))<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 end_time = &#8220;{:02d}:{:02d}:{:02d},{:03d}&#8221;.format(int(segment.end \/\/ 3600), int((segment.end % 3600) \/\/ 60), int(segment.end % 60), int((segment.end % 1) * 1000))<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 print(&#8220;%d\\n%s &#8211;&gt; %s\\n%s\\n&#8221; % (segment.id, start_time, end_time, segment.text))<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 finally:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 # \u5220\u9664\u6a21\u578b\u5b9e\u4f8b\u4ee5\u91ca\u653eRAM<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 del model<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 except Exception as e:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 print(&#8220;An error occurred during transcription:&#8221;)<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 print(str(e))<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 finally:<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 # \u8c03\u7528\u5783\u573e\u56de\u6536<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 gc.collect()<\/div>\n<div>\u00a0 \u00a0 \u00a0 \u00a0 
print(&#8220;\u8f6c\u5f55\u5b8c\u6210\u3002&#8221;)<\/div>\n<\/div>\n<p>&nbsp;<\/p>\n<div>\n<div># \u7ec4\u88c5GUI\u754c\u9762<\/div>\n<div>display(model_label, model_dropdown, language_label, language_dropdown, beam_size_label, beam_size_slider, compute_type_label, compute_type_dropdown, mode_label, mode_dropdown, initial_prompt_label, initial_prompt_text, file_name_text, transcribe_button, output_area)<\/div>\n<div>transcribe_button.on_click(transcribe_audio)<\/div>\n<\/div>\n<\/blockquote>\n<div><\/div>\n<div>\u8bc6\u522b\u591a\u53d1\u8a00\u4eba\u4ee3\u7801\u793a\u4f8b<\/div>\n<blockquote>\n<div>\n<p>from pyannote.core import Segment<\/p>\n<p>def get_text_with_timestamp(transcribe_res):<br \/>\ntimestamp_texts = []<br \/>\nfor item in transcribe_res:<br \/>\nstart = item.start<br \/>\nend = item.end<br \/>\ntext = item.text.strip()<br \/>\ntimestamp_texts.append((Segment(start, end), text))<br \/>\nreturn timestamp_texts<\/p>\n<p>def add_speaker_info_to_text(timestamp_texts, ann):<br \/>\nspk_text = []<br \/>\nfor seg, text in timestamp_texts:<br \/>\nspk = ann.crop(seg).argmax()<br \/>\nspk_text.append((seg, spk, text))<br \/>\nreturn spk_text<\/p>\n<p>def merge_cache(text_cache):<br \/>\nsentence = &#8221;.join([item[-1] for item in text_cache])<br \/>\nspk = text_cache[0][1]<br \/>\nstart = round(text_cache[0][0].start, 1)<br \/>\nend = round(text_cache[-1][0].end, 1)<br \/>\nreturn Segment(start, end), spk, sentence<\/p>\n<p>PUNC_SENT_END = [&#8216;,&#8217;, &#8216;.&#8217;, &#8216;?&#8217;, &#8216;!&#8217;, &#8220;\uff0c&#8221;, &#8220;\u3002&#8221;, &#8220;\uff1f&#8221;, &#8220;\uff01&#8221;]<\/p>\n<p>def merge_sentence(spk_text):<br \/>\nmerged_spk_text = []<br \/>\npre_spk = None<br \/>\ntext_cache = []<br \/>\nfor seg, spk, text in spk_text:<br \/>\nif spk != pre_spk and pre_spk is not None and len(text_cache) &gt; 0:<br \/>\nmerged_spk_text.append(merge_cache(text_cache))<br \/>\ntext_cache = [(seg, spk, text)]<br \/>\npre_spk = spk<\/p>\n<p>elif text 
and len(text) &gt; 0 and text[-1] in PUNC_SENT_END:<br \/>\ntext_cache.append((seg, spk, text))<br \/>\nmerged_spk_text.append(merge_cache(text_cache))<br \/>\ntext_cache = []<br \/>\npre_spk = spk<br \/>\nelse:<br \/>\ntext_cache.append((seg, spk, text))<br \/>\npre_spk = spk<br \/>\nif len(text_cache) &gt; 0:<br \/>\nmerged_spk_text.append(merge_cache(text_cache))<br \/>\nreturn merged_spk_text<\/p>\n<p>def diarize_text(transcribe_res, diarization_result):<br \/>\ntimestamp_texts = get_text_with_timestamp(transcribe_res)<br \/>\nspk_text = add_speaker_info_to_text(timestamp_texts, diarization_result)<br \/>\nres_processed = merge_sentence(spk_text)<br \/>\nreturn res_processed<\/p>\n<p>def write_to_txt(spk_sent, file):<br \/>\nwith open(file, &#8216;w&#8217;) as fp:<br \/>\nfor seg, spk, sentence in spk_sent:<br \/>\nline = f'{seg.start:.2f} {seg.end:.2f} {spk} {sentence}\\n'<br \/>\nfp.write(line)<\/p>\n<p>&nbsp;<\/p>\n<p>import torch<br \/>\nimport <a href=\"https:\/\/www.kdjingpai.com\/de\/crisperwhisper\/\">whisper<\/a><br \/>\nimport numpy as np<br \/>\nfrom pydub import AudioSegment<br \/>\nfrom loguru import logger<br \/>\nfrom faster_whisper import WhisperModel<br \/>\nfrom pyannote.audio import Pipeline<br \/>\nfrom pyannote.audio import Audio<\/p>\n<p>from common.error import ErrorCode<\/p>\n<p>model_path = config[&#8220;asr&#8221;][&#8220;faster-whisper-large-v3&#8221;]<\/p>\n<p># \u6d4b\u8bd5\u97f3\u9891\uff1a https:\/\/isv-data.oss-cn-hangzhou.aliyuncs.com\/ics\/MaaS\/ASR\/test_audio\/asr_speaker_demo.wav<br \/>\naudio = &#8220;.\/test\/asr\/data\/asr_speaker_demo.wav&#8221;<br \/>\nasr_model = WhisperModel(model_path, device=&#8221;cuda&#8221;, compute_type=&#8221;float16&#8221;)<br \/>\nspk_rec_pipeline = Pipeline.from_pretrained(&#8220;pyannote\/speaker-diarization-3.1&#8221;, use_auth_token=&#8221;your huggingface token&#8221;)<br \/>\nspk_rec_pipeline.to(torch.device(&#8220;cuda&#8221;))<\/p>\n<p>asr_result, info = 
asr_model.transcribe(audio, language=&#8221;zh&#8221;, beam_size=5)<br \/>\ndiarization_result = spk_rec_pipeline(audio)<\/p>\n<p>final_result = diarize_text(asr_result, diarization_result)<br \/>\nfor segment, spk, sent in final_result:<br \/>\nprint(&#8220;[%.2fs -&gt; %.2fs] %s %s&#8221; % (segment.start, segment.end, sent, spk))<\/p>\n<\/div>\n<\/blockquote>\n<p>&nbsp;<\/p>\n<h2>\u76f8\u5173\u8d44\u6e90<\/h2>\n<p>\u4e3b\u9879\u76ee\uff1ahttps:\/\/github.com\/SYSTRAN\/faster-whisper<\/p>\n<p>\u7b80\u6613\u4ee3\u7801\uff1ahttps:\/\/www.letswrite.tw\/colab-faster-whisper\/<\/p>\n<p>\u6cb9\u7ba1\u89c6\u9891\u8f6c\u5b57\u5e55\uff1ahttps:\/\/github.com\/lewangdev\/faster-whisper-youtube<\/p>\n<p>Fast Whisper \u5b9e\u65f6\u8bed\u97f3\u8f6c\u5f55\uff1ahttps:\/\/www.kaggle.com\/code\/xiu0714\/faster-whisper<\/p>\n<p>&nbsp;<\/p>\n<p><!--wechatfans end--><\/p>\n<h2>\u4e00\u952e\u5b89\u88c5\u5305<\/h2>\n<p><div class=\"huoduan_hide_box\" style=\"border:1px dashed #F60; padding:10px; margin:10px 0; line-height:200%; color:#F00; background-color:#FFF4FF; overflow:hidden; clear:both;\"><img loading=\"lazy\" decoding=\"async\" class=\"wxpic\" align=\"right\" src=\"https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2025\/05\/d8668ed8023fbe2.jpg\" style=\"width:150px;height:150px;margin-left:20px;display:inline;border:none\" width=\"150\" height=\"150\"  alt=\"AI\u751f\u4ea7\u529b\u5e94\u7528\" \/><span style=\"font-size:18px;\">\u6b64\u5904\u5185\u5bb9\u5df2\u7ecf\u88ab\u4f5c\u8005\u9690\u85cf\uff0c\u8bf7\u8f93\u5165\u9a8c\u8bc1\u7801\u67e5\u770b\u5185\u5bb9<\/span><form method=\"post\" style=\"margin:10px 0;\"><span class=\"yzts\" style=\"font-size:18px;float:left;\">\u9a8c\u8bc1\u7801\uff1a<\/span><input name=\"huoduan_verifycode\" id=\"verifycode\" type=\"text\" value=\"\" style=\"border:none;float:left;width:80px; height:32px; line-height:30px; padding:0 5px; border:1px solid #FF6600;-moz-border-radius: 0px;  -webkit-border-radius: 0px;  border-radius:0px;\" \/><input 
id=\"verifybtn\" style=\"border:none;float:left;width:80px; height:32px; line-height:32px; padding:0 5px; background-color:#F60; text-align:center; border:none; cursor:pointer; color:#FFF;-moz-border-radius: 0px; font-size:14px;  -webkit-border-radius: 0px;  border-radius:0px;\" name=\"\" type=\"submit\" value=\"\u63d0\u4ea4\u67e5\u770b\" \/><\/form><div style=\"clear:left;\"><\/div><span style=\"color:#00BF30\">\u8bf7\u5173\u6ce8\u672c\u7ad9\u5fae\u4fe1\u516c\u4f17\u53f7\uff0c\u56de\u590d\u201c<span style=\"color:blue\">\u9a8c\u8bc1\u7801<\/span>\u201d\uff0c\u83b7\u53d6\u9a8c\u8bc1\u7801\u3002\u5728\u5fae\u4fe1\u91cc\u641c\u7d22\u201c<span style=\"color:blue\">AI\u751f\u4ea7\u529b\u5e94\u7528<\/span>\u201d\u6216\u8005\u201c<span style=\"color:blue\">Artificial9527<\/span>\u201d\u6216\u8005\u5fae\u4fe1\u626b\u63cf\u53f3\u4fa7\u4e8c\u7ef4\u7801\u90fd\u53ef\u4ee5\u5173\u6ce8\u672c\u7ad9\u5fae\u4fe1\u516c\u4f17\u53f7\u3002<\/span><div class=\"cl\"><\/div><\/div><\/p>\n","protected":false},"excerpt":{"rendered":"<p>insanely-fast-whisper\u662f\u4e00\u4e2a\u7ed3\u5408\u4e86OpenAI\u7684Whisper\u6a21\u578b\u548c\u5404\u79cd\u4f18\u5316\u6280\u672f\uff08\u5982Transformers, Optimum, Flash 
Attention\uff09\u7684\u97f3\u9891\u8f6c\u5f55\u5de5\u5177\uff0c\u63d0\u4f9b\u4e86\u547d\u4ee4\u884c\u754c\u9762\uff08CLI\uff09\uff0c\u65e8\u5728\u5feb\u901f\u9ad8\u6548\u5730&#8230;<\/p>\n","protected":false},"author":1,"featured_media":60898,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[20],"tags":[230,216],"class_list":["post-4106","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-tool","tag-aikaiyuanxiangmu","tag-aiyuyinzhuanwenben"],"_links":{"self":[{"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/posts\/4106","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/comments?post=4106"}],"version-history":[{"count":0,"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/posts\/4106\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/media\/60898"}],"wp:attachment":[{"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/media?parent=4106"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/categories?post=4106"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/ja\/wp-json\/wp\/v2\/tags?post=4106"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}