python使用whisper讀取藍牙耳機語音并轉為文字

更新時間：2025年05月08日 09:15:15 作者：waterHBO

這篇文章主要為大家詳細介紹了 Python 如何使用 Whisper 讀取藍牙耳機語音並識別轉為文字，文中的示例代碼講解詳細，感興趣的小伙伴可以了解一下。

1. 起因目的

看到別人做了類似的效果，所以自己也想動手試試看。

2. 先看效果

3. 過程

我用的是藍牙耳機，EDIFIER W820NB

先找到聲音，設置為 Hands-Free 模式

代碼 1 ，查找設備名稱，看看哪個是能用的

我的設備，能用的是 index=27

import sounddevice as sd
import numpy as np
import wave
import re

def list_input_devices():
    """Print every device that can capture audio and return those devices.

    Each returned entry has its position in the ``sd.query_devices()`` listing
    stored under the ``'index'`` key.
    """
    print("?? 可用音頻輸入設備列表：")
    capture_capable = []
    for i, device in enumerate(sd.query_devices()):
        # Skip devices that expose no input channels (playback-only).
        if device['max_input_channels'] <= 0:
            continue
        device['index'] = i
        print(f"Index {i}: {device['name']} - {device['max_input_channels']} channels - {device['default_samplerate']} Hz")
        capture_capable.append(device)
    return capture_capable

def record_audio(device_info, seconds=10, channels=1, rate=16000):
    """Record from one input device and save the capture as a 16-bit WAV file.

    The file is written to the current working directory. Its name is the
    device name with every character other than word characters, whitespace
    and ``-`` replaced by ``_``, followed by ``_output.wav``.

    Errors are reported on stdout instead of being raised.

    Args:
        device_info: Device entry providing the ``'index'`` and ``'name'`` keys.
        seconds: Recording length in seconds.
        channels: Number of channels to capture (defaults to 1, mono).
        rate: Sample rate in Hz (defaults to 16000).

    Returns:
        The name of the WAV file that was written, or None if an error was
        reported.
    """
    try:
        device_index = device_info['index']

        print(f"\n??? 使用設備: {device_info['name']}")
        print(f"?? 設備索引: {device_index}")
        print(f"?? 通道數(shù): {channels}")
        print(f"?? 采樣率: {rate} Hz\n")

        # Validate the requested configuration before starting the capture so
        # an unsupported combination is reported up front.
        print("?? 檢查設備配置...")
        sd.check_input_settings(device=device_index, channels=channels, samplerate=rate, dtype='int16')
        print("? 配置有效")

        print("??? 正在錄音中...")
        audio_data = sd.rec(int(seconds * rate), samplerate=rate, channels=channels, dtype='int16', device=device_index)
        sd.wait()  # block until the requested number of frames is captured

        # Build a file-system-safe name out of the device name.
        safe_device_name = re.sub(r'[^\w\s-]', '_', device_info['name']).replace('\r', '').replace('\n', '').strip()
        output_file = f"{safe_device_name}_output.wav"

        with wave.open(output_file, 'wb') as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(2)  # 2 bytes per sample, matching dtype='int16'
            wf.setframerate(rate)
            wf.writeframes(audio_data.tobytes())

        print(f"?? 錄音已保存為 {output_file}")
        return output_file

    except sd.PortAudioError as pae:
        print(f"? 音頻設備錯誤：{pae}")
    except OSError as ose:
        print(f"? 文件系統(tǒng)錯誤：{ose}")
    except Exception as e:
        print(f"? 未知錯誤：{e}")
    return None

if __name__ == "__main__":
    # Script entry point: list the capture devices, then make a test recording
    # on each one whose name identifies the EDIFIER W820NB Hands-Free endpoint.
    print("?? 使用默認音頻接口")
    input_devices = list_input_devices()
    if not input_devices:
        print("? 沒有可用的音頻輸入設備。")
    else:
        for device in input_devices:
            device_label = device['name']
            if 'EDIFIER W820NB' not in device_label or 'Hands-Free' not in device_label:
                continue
            print(f"正在測試耳機設備: {device['name']}")
            record_audio(device)

代碼 2，使用 whisper 將語音轉為文字。效果很勉強，見文末總結。

import sounddevice as sd
import numpy as np
import wave
import tempfile
import os
import whisper

# Load the Whisper model once, at import time.
# The size can be changed to "tiny", "base", "small" or "large".
_WHISPER_MODEL_SIZE = "medium"
model = whisper.load_model(_WHISPER_MODEL_SIZE)

# Audio recording settings
CHANNELS = 1  # mono; Hands-Free mode usually supports only one channel
RATE = 16000  # 16000 Hz, suitable for Hands-Free mode
RECORD_SECONDS = 5  # length of each recording, in seconds
DEVICE_INDEX = 27  # device index already verified to work
# Raw string: in a normal literal "\b" (in "\bthhfenum") is a backspace
# character and "\d" is an invalid escape sequence, which corrupts the name.
DEVICE_NAME = r"耳機 (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free AG Audio%0;(EDIFIER W820NB 雙金標版))"

def record_audio(seconds=RECORD_SECONDS):
    """Record from the configured device into a temporary 16-bit WAV file.

    Uses the module-level CHANNELS, RATE and DEVICE_INDEX settings. Errors are
    reported on stdout instead of being raised.

    Args:
        seconds: Recording length in seconds.

    Returns:
        Path of the temporary WAV file, or None if recording or saving failed.
        The file is not deleted here; the caller is responsible for removing it.
    """
    wav_path = None
    try:
        print(f"?? 正在錄音 {seconds} 秒...")
        # Capture audio with sounddevice
        audio_data = sd.rec(
            int(seconds * RATE),
            samplerate=RATE,
            channels=CHANNELS,
            dtype='int16',
            device=DEVICE_INDEX
        )
        sd.wait()  # wait for the recording to finish

        # Reserve a unique path, then close that handle before wave reopens the
        # file by name, so the file is never held open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
            wav_path = tmpfile.name

        with wave.open(wav_path, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(RATE)
            wf.writeframes(audio_data.tobytes())
        return wav_path

    except sd.PortAudioError as pae:
        print(f"? 音頻設備錯誤：{pae}")
    except Exception as e:
        print(f"? 未知錯誤：{e}")

    # Failure path: do not leave a partially written temp file behind.
    if wav_path is not None:
        try:
            os.remove(wav_path)
        except OSError:
            pass  # best-effort cleanup; the error was already reported above
    return None

def transcribe_audio(audio_file):
    """Transcribe an audio file as Chinese, print the text, then delete the file.

    Recognition errors are printed rather than raised. The file is removed
    whether or not recognition succeeded.

    Args:
        audio_file: Path of the audio file to transcribe and then delete.
    """
    try:
        print("?? 正在識別...")
        recognized_text = model.transcribe(audio_file, language="zh")['text']
        print("?? 識別結果:", recognized_text.strip())
    except Exception as e:
        print(f"? 語音識別失敗：{e}")
    finally:
        # Remove the audio file once it has been processed.
        file_still_present = os.path.exists(audio_file)
        if file_still_present:
            os.remove(audio_file)

if __name__ == "__main__":
    # Script entry point: record a clip, transcribe it, and repeat until the
    # user presses Ctrl+C.
    print(f"?? 使用設備: {DEVICE_NAME} (索引: {DEVICE_INDEX})")
    print("??? 開始實時聽寫，按 Ctrl+C 停止")

    try:
        while True:
            recorded_path = record_audio()
            if not recorded_path:
                # Recording failed, so there is nothing to transcribe.
                print("?? 錄音失敗，跳過識別")
            else:
                transcribe_audio(recorded_path)
            # Short pause so recordings do not start back to back.
            sd.sleep(100)  # 100 milliseconds

    except KeyboardInterrupt:
        print("?? 停止實時識別")
    except Exception as e:
        print(f"? 程序錯誤：{e}")