Documentation ¶
Overview ¶
Speech recognition with Next-gen Kaldi.
sherpa-onnx is an open-source speech recognition framework for Next-gen Kaldi. It depends only on onnxruntime and supports both streaming and non-streaming speech recognition.
It does not need network access during recognition; everything runs locally.
It supports a variety of platforms, such as Linux (x86_64, aarch64, arm), Windows (x86_64, x86), macOS (x86_64, arm64), etc.
Usage examples:
Real-time speech recognition from a microphone
Decode files using a non-streaming model
Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-decode-files
Decode files using a streaming model
Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/streaming-decode-files
Convert text to speech using a non-streaming model
Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-tts
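The general pattern of the API is sketched below: every object created by a New* constructor wraps a C pointer and must be released with the matching Delete* function. The import path is taken from the official go-api-examples and is an assumption here; verify it against your go.mod.

package main

import (
    "fmt"

    // Import path assumed from the official go-api-examples; verify it against your go.mod.
    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Print build information of the underlying sherpa-onnx library.
    fmt.Println("version:", sherpa.GetVersion())
    fmt.Println("git sha1:", sherpa.GetGitSha1())
    fmt.Println("git date:", sherpa.GetGitDate())

    // Every New* constructor in this package has a matching Delete* function
    // that must be called to release the underlying C object, e.g.
    //
    //   recognizer := sherpa.NewOfflineRecognizer(&config)
    //   defer sherpa.DeleteOfflineRecognizer(recognizer)
}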
Index ¶
- func DeleteAudioTagging(tagging *AudioTagging)
- func DeleteCircularBuffer(buffer *CircularBuffer)
- func DeleteKeywordSpotter(spotter *KeywordSpotter)
- func DeleteOfflinePunc(punc *OfflinePunctuation)
- func DeleteOfflineRecognizer(recognizer *OfflineRecognizer)
- func DeleteOfflineSpeakerDiarization(sd *OfflineSpeakerDiarization)
- func DeleteOfflineSpeechDenoiser(sd *OfflineSpeechDenoiser)
- func DeleteOfflineStream(stream *OfflineStream)
- func DeleteOfflineTts(tts *OfflineTts)
- func DeleteOnlineRecognizer(recognizer *OnlineRecognizer)
- func DeleteOnlineStream(stream *OnlineStream)
- func DeleteSpeakerEmbeddingExtractor(ex *SpeakerEmbeddingExtractor)
- func DeleteSpeakerEmbeddingManager(m *SpeakerEmbeddingManager)
- func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification)
- func DeleteVoiceActivityDetector(vad *VoiceActivityDetector)
- func GetGitDate() string
- func GetGitSha1() string
- func GetVersion() string
- type AudioEvent
- type AudioTagging
- type AudioTaggingConfig
- type AudioTaggingModelConfig
- type CircularBuffer
- type DenoisedAudio
- type FastClusteringConfig
- type FeatureConfig
- type GeneratedAudio
- type HomophoneReplacerConfig
- type KeywordSpotter
- type KeywordSpotterConfig
- type KeywordSpotterResult
- type OfflineCanaryModelConfig
- type OfflineDolphinModelConfig
- type OfflineFireRedAsrModelConfig
- type OfflineLMConfig
- type OfflineModelConfig
- type OfflineMoonshineModelConfig
- type OfflineNemoEncDecCtcModelConfig
- type OfflineOmnilingualAsrCtcModelConfig
- type OfflineParaformerModelConfig
- type OfflinePunctuation
- type OfflinePunctuationConfig
- type OfflinePunctuationModelConfig
- type OfflineRecognizer
- type OfflineRecognizerConfig
- type OfflineRecognizerResult
- type OfflineSenseVoiceModelConfig
- type OfflineSpeakerDiarization
- type OfflineSpeakerDiarizationConfig
- type OfflineSpeakerDiarizationSegment
- type OfflineSpeakerSegmentationModelConfig
- type OfflineSpeakerSegmentationPyannoteModelConfig
- type OfflineSpeechDenoiser
- type OfflineSpeechDenoiserConfig
- type OfflineSpeechDenoiserGtcrnModelConfig
- type OfflineSpeechDenoiserModelConfig
- type OfflineStream
- type OfflineTdnnModelConfig
- type OfflineTransducerModelConfig
- type OfflineTts
- func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio
- func (tts *OfflineTts) GenerateWithCallback(text string, sid int, speed float32, ...)
- func (tts *OfflineTts) GenerateWithProgressCallback(text string, sid int, speed float32, ...)
- func (tts *OfflineTts) GenerateWithZipvoice(text, promptText string, promptSamples []float32, promptSampleRate int, ...) *GeneratedAudio
- type OfflineTtsConfig
- type OfflineTtsKittenModelConfig
- type OfflineTtsKokoroModelConfig
- type OfflineTtsMatchaModelConfig
- type OfflineTtsModelConfig
- type OfflineTtsVitsModelConfig
- type OfflineTtsZipvoiceModelConfig
- type OfflineWenetCtcModelConfig
- type OfflineWhisperModelConfig
- type OfflineZipformerAudioTaggingModelConfig
- type OfflineZipformerCtcModelConfig
- type OnlineCtcFstDecoderConfig
- type OnlineModelConfig
- type OnlineNemoCtcModelConfig
- type OnlineParaformerModelConfig
- type OnlineRecognizer
- func (recognizer *OnlineRecognizer) Decode(s *OnlineStream)
- func (recognizer *OnlineRecognizer) DecodeStreams(s []*OnlineStream)
- func (recognizer *OnlineRecognizer) GetResult(s *OnlineStream) *OnlineRecognizerResult
- func (recognizer *OnlineRecognizer) IsEndpoint(s *OnlineStream) bool
- func (recognizer *OnlineRecognizer) IsReady(s *OnlineStream) bool
- func (recognizer *OnlineRecognizer) Reset(s *OnlineStream)
- type OnlineRecognizerConfig
- type OnlineRecognizerResult
- type OnlineStream
- type OnlineToneCtcModelConfig
- type OnlineTransducerModelConfig
- type OnlineZipformer2CtcModelConfig
- type SileroVadModelConfig
- type SpeakerEmbeddingExtractor
- type SpeakerEmbeddingExtractorConfig
- type SpeakerEmbeddingManager
- func (m *SpeakerEmbeddingManager) AllSpeakers() []string
- func (m *SpeakerEmbeddingManager) Contains(name string) bool
- func (m *SpeakerEmbeddingManager) NumSpeakers() int
- func (m *SpeakerEmbeddingManager) Register(name string, embedding []float32) bool
- func (m *SpeakerEmbeddingManager) RegisterV(name string, embeddings [][]float32) bool
- func (m *SpeakerEmbeddingManager) Remove(name string) bool
- func (m *SpeakerEmbeddingManager) Search(embedding []float32, threshold float32) string
- func (m *SpeakerEmbeddingManager) Verify(name string, embedding []float32, threshold float32) bool
- type SpeechSegment
- type SpokenLanguageIdentification
- type SpokenLanguageIdentificationConfig
- type SpokenLanguageIdentificationResult
- type SpokenLanguageIdentificationWhisperConfig
- type TenVadModelConfig
- type VadModelConfig
- type VoiceActivityDetector
- func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32)
- func (vad *VoiceActivityDetector) Clear()
- func (vad *VoiceActivityDetector) Flush()
- func (vad *VoiceActivityDetector) Front() *SpeechSegment
- func (vad *VoiceActivityDetector) IsEmpty() bool
- func (vad *VoiceActivityDetector) IsSpeech() bool
- func (vad *VoiceActivityDetector) Pop()
- func (vad *VoiceActivityDetector) Reset()
- type Wave
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func DeleteAudioTagging ¶ added in v1.10.44
func DeleteAudioTagging(tagging *AudioTagging)
func DeleteCircularBuffer ¶ added in v1.9.15
func DeleteCircularBuffer(buffer *CircularBuffer)
func DeleteKeywordSpotter ¶ added in v1.10.37
func DeleteKeywordSpotter(spotter *KeywordSpotter)
Frees the internal pointer inside the spotter to avoid a memory leak.
func DeleteOfflinePunc ¶ added in v1.10.29
func DeleteOfflinePunc(punc *OfflinePunctuation)
func DeleteOfflineRecognizer ¶
func DeleteOfflineRecognizer(recognizer *OfflineRecognizer)
Frees the internal pointer of the recognizer to avoid a memory leak.
func DeleteOfflineSpeakerDiarization ¶ added in v1.10.28
func DeleteOfflineSpeakerDiarization(sd *OfflineSpeakerDiarization)
func DeleteOfflineSpeechDenoiser ¶ added in v1.11.0
func DeleteOfflineSpeechDenoiser(sd *OfflineSpeechDenoiser)
Frees the internal pointer inside the OfflineSpeechDenoiser to avoid a memory leak.
func DeleteOfflineStream ¶
func DeleteOfflineStream(stream *OfflineStream)
Frees the internal pointer of the stream to avoid a memory leak.
func DeleteOfflineTts ¶ added in v1.8.4
func DeleteOfflineTts(tts *OfflineTts)
Frees the internal pointer inside the tts to avoid a memory leak.
func DeleteOnlineRecognizer ¶
func DeleteOnlineRecognizer(recognizer *OnlineRecognizer)
Frees the internal pointer inside the recognizer to avoid a memory leak.
func DeleteOnlineStream ¶
func DeleteOnlineStream(stream *OnlineStream)
Frees the internal pointer inside the stream to avoid a memory leak.
func DeleteSpeakerEmbeddingExtractor ¶ added in v1.9.15
func DeleteSpeakerEmbeddingExtractor(ex *SpeakerEmbeddingExtractor)
func DeleteSpeakerEmbeddingManager ¶ added in v1.9.15
func DeleteSpeakerEmbeddingManager(m *SpeakerEmbeddingManager)
func DeleteSpokenLanguageIdentification ¶ added in v1.9.15
func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification)
func DeleteVoiceActivityDetector ¶ added in v1.9.15
func DeleteVoiceActivityDetector(vad *VoiceActivityDetector)
func GetGitDate ¶ added in v1.12.2
func GetGitDate() string
func GetGitSha1 ¶ added in v1.12.2
func GetGitSha1() string
func GetVersion ¶ added in v1.12.2
func GetVersion() string
Types ¶
type AudioEvent ¶ added in v1.10.44
type AudioTagging ¶ added in v1.10.44
type AudioTagging struct {
// contains filtered or unexported fields
}
func NewAudioTagging ¶ added in v1.10.44
func NewAudioTagging(config *AudioTaggingConfig) *AudioTagging
The user is responsible for invoking DeleteAudioTagging() to free the returned tagger and avoid a memory leak.
func (*AudioTagging) Compute ¶ added in v1.10.44
func (tagging *AudioTagging) Compute(s *OfflineStream, topK int32) []AudioEvent
type AudioTaggingConfig ¶ added in v1.10.44
type AudioTaggingConfig struct {
Model AudioTaggingModelConfig
Labels string
TopK int32
}
type AudioTaggingModelConfig ¶ added in v1.10.44
type AudioTaggingModelConfig struct {
Zipformer OfflineZipformerAudioTaggingModelConfig
Ced string
NumThreads int32
Debug int32
Provider string
}
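A minimal sketch of audio tagging, using the same assumed import path as the overview sketch. The model and label file names are placeholders for a downloaded audio-tagging model, and the silent buffer stands in for real 16 kHz audio.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Placeholder file names; use the files from a downloaded audio-tagging model.
    config := sherpa.AudioTaggingConfig{}
    config.Model.Zipformer.Model = "./model.onnx"
    config.Model.NumThreads = 1
    config.Model.Provider = "cpu"
    config.Labels = "./class_labels_indices.csv"
    config.TopK = 5

    tagging := sherpa.NewAudioTagging(&config)
    defer sherpa.DeleteAudioTagging(tagging)

    stream := sherpa.NewAudioTaggingStream(tagging)
    defer sherpa.DeleteOfflineStream(stream)

    // In practice, load real 16 kHz mono samples in [-1, 1] here.
    samples := make([]float32, 16000)
    stream.AcceptWaveform(16000, samples)

    // Compute the top 5 audio events for the stream.
    for _, ev := range tagging.Compute(stream, 5) {
        fmt.Printf("%+v\n", ev)
    }
}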
type CircularBuffer ¶ added in v1.9.15
type CircularBuffer struct {
// contains filtered or unexported fields
}
func NewCircularBuffer ¶ added in v1.9.15
func NewCircularBuffer(capacity int) *CircularBuffer
func (*CircularBuffer) Get ¶ added in v1.9.15
func (buffer *CircularBuffer) Get(start int, n int) []float32
func (*CircularBuffer) Head ¶ added in v1.9.15
func (buffer *CircularBuffer) Head() int
func (*CircularBuffer) Pop ¶ added in v1.9.15
func (buffer *CircularBuffer) Pop(n int)
func (*CircularBuffer) Push ¶ added in v1.9.15
func (buffer *CircularBuffer) Push(samples []float32)
func (*CircularBuffer) Reset ¶ added in v1.9.15
func (buffer *CircularBuffer) Reset()
func (*CircularBuffer) Size ¶ added in v1.9.15
func (buffer *CircularBuffer) Size() int
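A minimal sketch of how CircularBuffer can accumulate audio chunks and hand them back in fixed-size blocks, using the same assumed import path as the overview sketch; the sizes below are arbitrary.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Buffer with room for 10 seconds of 16 kHz audio.
    buffer := sherpa.NewCircularBuffer(10 * 16000)
    defer sherpa.DeleteCircularBuffer(buffer)

    chunk := make([]float32, 1600) // 100 ms of audio at 16 kHz
    buffer.Push(chunk)
    buffer.Push(chunk)

    fmt.Println("head:", buffer.Head(), "size:", buffer.Size())

    // Read the first 1600 samples starting at the current head, then drop them.
    samples := buffer.Get(buffer.Head(), 1600)
    buffer.Pop(1600)
    fmt.Println("read", len(samples), "samples; remaining:", buffer.Size())
}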
type DenoisedAudio ¶ added in v1.11.0
type DenoisedAudio struct {
// Normalized samples in the range [-1, 1]
Samples []float32
SampleRate int
}
func (*DenoisedAudio) Save ¶ added in v1.11.0
func (audio *DenoisedAudio) Save(filename string) bool
type FastClusteringConfig ¶ added in v1.10.28
type FeatureConfig ¶
type FeatureConfig struct {
// Sample rate expected by the model. It is 16000 for all
// pre-trained models provided by us
SampleRate int
// Feature dimension expected by the model. It is 80 for all
// pre-trained models provided by us
FeatureDim int
}
Configuration for the feature extractor
type GeneratedAudio ¶ added in v1.8.4
type GeneratedAudio struct {
// Normalized samples in the range [-1, 1]
Samples []float32
SampleRate int
}
func (*GeneratedAudio) Save ¶ added in v1.8.4
func (audio *GeneratedAudio) Save(filename string) bool
type HomophoneReplacerConfig ¶ added in v1.11.4
type KeywordSpotter ¶ added in v1.10.37
type KeywordSpotter struct {
// contains filtered or unexported fields
}
func NewKeywordSpotter ¶ added in v1.10.37
func NewKeywordSpotter(config *KeywordSpotterConfig) *KeywordSpotter
The user is responsible for invoking DeleteKeywordSpotter() to free the returned spotter and avoid a memory leak.
func (*KeywordSpotter) Decode ¶ added in v1.10.37
func (spotter *KeywordSpotter) Decode(s *OnlineStream)
Decode the stream. Before calling this function, you have to ensure that spotter.IsReady(s) returns true. Otherwise, you will be SAD.
You usually use it like below:
for spotter.IsReady(s) {
    spotter.Decode(s)
}
func (*KeywordSpotter) GetResult ¶ added in v1.10.37
func (spotter *KeywordSpotter) GetResult(s *OnlineStream) *KeywordSpotterResult
Gets the current result of the stream since the last invocation of Reset().
func (*KeywordSpotter) IsReady ¶ added in v1.10.37
func (spotter *KeywordSpotter) IsReady(s *OnlineStream) bool
Checks whether the stream has enough feature frames for decoding. Returns true if the stream is ready for decoding and false otherwise.
You will usually use it like below:
for spotter.IsReady(s) {
    spotter.Decode(s)
}
func (*KeywordSpotter) Reset ¶ added in v1.10.41
func (spotter *KeywordSpotter) Reset(s *OnlineStream)
You MUST call it right after detecting a keyword.
type KeywordSpotterConfig ¶ added in v1.10.37
type KeywordSpotterConfig struct {
FeatConfig FeatureConfig
ModelConfig OnlineModelConfig
MaxActivePaths int
KeywordsFile string
KeywordsScore float32
KeywordsThreshold float32
KeywordsBuf string
KeywordsBufSize int
}
Configuration for the keyword spotter.
type KeywordSpotterResult ¶ added in v1.10.37
type KeywordSpotterResult struct {
Keyword string
}
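A minimal sketch of keyword spotting, using the same assumed import path as the overview sketch. The model, token, and keyword file names are placeholders for a downloaded keyword-spotting model, and the silent buffer stands in for real microphone audio.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Placeholder file names; use a downloaded keyword-spotting model.
    config := sherpa.KeywordSpotterConfig{}
    config.FeatConfig.SampleRate = 16000
    config.FeatConfig.FeatureDim = 80
    config.ModelConfig.Transducer.Encoder = "./encoder.onnx"
    config.ModelConfig.Transducer.Decoder = "./decoder.onnx"
    config.ModelConfig.Transducer.Joiner = "./joiner.onnx"
    config.ModelConfig.Tokens = "./tokens.txt"
    config.ModelConfig.NumThreads = 1
    config.KeywordsFile = "./keywords.txt"

    spotter := sherpa.NewKeywordSpotter(&config)
    defer sherpa.DeleteKeywordSpotter(spotter)

    stream := sherpa.NewKeywordStream(spotter)
    defer sherpa.DeleteOnlineStream(stream)

    // In practice, feed real 16 kHz mono audio chunks here.
    samples := make([]float32, 16000)
    stream.AcceptWaveform(16000, samples)

    for spotter.IsReady(stream) {
        spotter.Decode(stream)
        if r := spotter.GetResult(stream); r.Keyword != "" {
            fmt.Println("detected keyword:", r.Keyword)
            spotter.Reset(stream) // must be called right after a detection
        }
    }
}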
type OfflineCanaryModelConfig ¶ added in v1.12.5
type OfflineDolphinModelConfig ¶ added in v1.11.3
type OfflineDolphinModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
type OfflineFireRedAsrModelConfig ¶ added in v1.10.45
type OfflineLMConfig ¶
type OfflineLMConfig struct {
Model string // Path to the model
Scale float32 // scale for LM score
}
Configuration for offline LM.
type OfflineModelConfig ¶
type OfflineModelConfig struct {
Transducer OfflineTransducerModelConfig
Paraformer OfflineParaformerModelConfig
NemoCTC OfflineNemoEncDecCtcModelConfig
Whisper OfflineWhisperModelConfig
Tdnn OfflineTdnnModelConfig
SenseVoice OfflineSenseVoiceModelConfig
Moonshine OfflineMoonshineModelConfig
FireRedAsr OfflineFireRedAsrModelConfig
Dolphin OfflineDolphinModelConfig
ZipformerCtc OfflineZipformerCtcModelConfig
Canary OfflineCanaryModelConfig
WenetCtc OfflineWenetCtcModelConfig
Omnilingual OfflineOmnilingualAsrCtcModelConfig
Tokens string // Path to tokens.txt
// Number of threads to use for neural network computation
NumThreads int
// 1 to print model meta information while loading
Debug int
// Optional. Valid values: cpu, cuda, coreml
Provider string
// Optional. Specify it for faster model initialization.
ModelType string
ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe
BpeVocab string // Optional.
TeleSpeechCtc string // Optional.
}
type OfflineMoonshineModelConfig ¶ added in v1.10.30
type OfflineNemoEncDecCtcModelConfig ¶
type OfflineNemoEncDecCtcModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
Configuration for offline/non-streaming NeMo CTC models.
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html to download pre-trained models
type OfflineOmnilingualAsrCtcModelConfig ¶ added in v1.12.17
type OfflineOmnilingualAsrCtcModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
type OfflineParaformerModelConfig ¶
type OfflineParaformerModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
Configuration for offline/non-streaming paraformer.
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html to download pre-trained models
type OfflinePunctuation ¶ added in v1.10.29
type OfflinePunctuation struct {
// contains filtered or unexported fields
}
func NewOfflinePunctuation ¶ added in v1.10.29
func NewOfflinePunctuation(config *OfflinePunctuationConfig) *OfflinePunctuation
func (*OfflinePunctuation) AddPunct ¶ added in v1.10.29
func (punc *OfflinePunctuation) AddPunct(text string) string
type OfflinePunctuationConfig ¶ added in v1.10.29
type OfflinePunctuationConfig struct {
Model OfflinePunctuationModelConfig
}
type OfflinePunctuationModelConfig ¶ added in v1.10.29
type OfflinePunctuationModelConfig struct {
CtTransformer string
NumThreads C.int
Debug C.int // true to print debug information of the model
Provider string
}
For punctuation.
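A minimal sketch of adding punctuation to raw text, using the same assumed import path as the overview sketch; the model path is a placeholder for a downloaded CT-Transformer punctuation model.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Placeholder path; use a downloaded CT-Transformer punctuation model.
    config := sherpa.OfflinePunctuationConfig{}
    config.Model.CtTransformer = "./model.onnx"
    config.Model.NumThreads = 1
    config.Model.Provider = "cpu"

    punc := sherpa.NewOfflinePunctuation(&config)
    defer sherpa.DeleteOfflinePunc(punc)

    text := "how are you i am fine thank you"
    fmt.Println(punc.AddPunct(text))
}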
type OfflineRecognizer ¶
type OfflineRecognizer struct {
// contains filtered or unexported fields
}
It wraps a pointer from C
func NewOfflineRecognizer ¶
func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer
The user is responsible for invoking DeleteOfflineRecognizer() to free the returned recognizer and avoid a memory leak.
func (*OfflineRecognizer) Decode ¶
func (recognizer *OfflineRecognizer) Decode(s *OfflineStream)
Decode the offline stream.
func (*OfflineRecognizer) DecodeStreams ¶
func (recognizer *OfflineRecognizer) DecodeStreams(s []*OfflineStream)
Decode multiple streams in parallel, i.e., in batch.
func (*OfflineRecognizer) SetConfig ¶ added in v1.11.0
func (r *OfflineRecognizer) SetConfig(config *OfflineRecognizerConfig)
Sets a new config to replace the existing one.
type OfflineRecognizerConfig ¶
type OfflineRecognizerConfig struct {
FeatConfig FeatureConfig
ModelConfig OfflineModelConfig
LmConfig OfflineLMConfig
// Valid decoding method: greedy_search, modified_beam_search
DecodingMethod string
// Used only when DecodingMethod is modified_beam_search.
MaxActivePaths int
HotwordsFile string
HotwordsScore float32
BlankPenalty float32
RuleFsts string
RuleFars string
Hr HomophoneReplacerConfig
}
Configuration for the offline/non-streaming recognizer.
type OfflineRecognizerResult ¶
type OfflineRecognizerResult struct {
Text string
Tokens []string
Timestamps []float32
Durations []float32
Lang string
Emotion string
Event string
}
It contains the recognition result of an offline stream.
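A minimal sketch of non-streaming recognition with a Paraformer model, using the same assumed import path as the overview sketch. The model and token file names are placeholders, and the silent buffer stands in for a real utterance loaded from disk.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Placeholder file names; use a downloaded non-streaming model.
    config := sherpa.OfflineRecognizerConfig{}
    config.FeatConfig.SampleRate = 16000
    config.FeatConfig.FeatureDim = 80
    config.ModelConfig.Paraformer.Model = "./model.int8.onnx"
    config.ModelConfig.Tokens = "./tokens.txt"
    config.ModelConfig.NumThreads = 2
    config.ModelConfig.Provider = "cpu"
    config.DecodingMethod = "greedy_search"

    recognizer := sherpa.NewOfflineRecognizer(&config)
    defer sherpa.DeleteOfflineRecognizer(recognizer)

    stream := sherpa.NewOfflineStream(recognizer)
    defer sherpa.DeleteOfflineStream(stream)

    // In practice, load the whole utterance as 16 kHz mono samples in [-1, 1]
    // and pass it in a single AcceptWaveform call.
    samples := make([]float32, 16000)
    stream.AcceptWaveform(16000, samples)

    recognizer.Decode(stream)
    result := stream.GetResult()
    fmt.Println("text:", result.Text)
}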
type OfflineSenseVoiceModelConfig ¶ added in v1.10.17
type OfflineSpeakerDiarization ¶ added in v1.10.28
type OfflineSpeakerDiarization struct {
// contains filtered or unexported fields
}
func NewOfflineSpeakerDiarization ¶ added in v1.10.28
func NewOfflineSpeakerDiarization(config *OfflineSpeakerDiarizationConfig) *OfflineSpeakerDiarization
func (*OfflineSpeakerDiarization) Process ¶ added in v1.10.28
func (sd *OfflineSpeakerDiarization) Process(samples []float32) []OfflineSpeakerDiarizationSegment
func (*OfflineSpeakerDiarization) SampleRate ¶ added in v1.10.28
func (sd *OfflineSpeakerDiarization) SampleRate() int
func (*OfflineSpeakerDiarization) SetConfig ¶ added in v1.10.28
func (sd *OfflineSpeakerDiarization) SetConfig(config *OfflineSpeakerDiarizationConfig)
Only config.Clustering is used. All other fields are ignored.
type OfflineSpeakerDiarizationConfig ¶ added in v1.10.28
type OfflineSpeakerDiarizationConfig struct {
Segmentation OfflineSpeakerSegmentationModelConfig
Embedding SpeakerEmbeddingExtractorConfig
Clustering FastClusteringConfig
MinDurationOn float32
MinDurationOff float32
}
type OfflineSpeakerDiarizationSegment ¶ added in v1.10.28
type OfflineSpeakerSegmentationModelConfig ¶ added in v1.10.28
type OfflineSpeakerSegmentationModelConfig struct {
Pyannote OfflineSpeakerSegmentationPyannoteModelConfig
NumThreads int
Debug int
Provider string
}
type OfflineSpeakerSegmentationPyannoteModelConfig ¶ added in v1.10.28
type OfflineSpeakerSegmentationPyannoteModelConfig struct {
Model string
}
For offline speaker diarization.
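A minimal sketch of offline speaker diarization, using the same assumed import path as the overview sketch. The segmentation model path is a placeholder, and config.Embedding and config.Clustering must be filled in according to the SpeakerEmbeddingExtractorConfig and FastClusteringConfig definitions in your version.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    config := sherpa.OfflineSpeakerDiarizationConfig{}
    config.Segmentation.Pyannote.Model = "./segmentation.onnx" // placeholder
    // Fill in config.Embedding (speaker embedding model) and config.Clustering
    // according to the struct definitions in your version.
    config.MinDurationOn = 0.3
    config.MinDurationOff = 0.5

    sd := sherpa.NewOfflineSpeakerDiarization(&config)
    defer sherpa.DeleteOfflineSpeakerDiarization(sd)

    // The input must match sd.SampleRate(); load real mono audio in practice.
    samples := make([]float32, 10*sd.SampleRate())
    for _, seg := range sd.Process(samples) {
        fmt.Printf("%+v\n", seg)
    }
}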
type OfflineSpeechDenoiser ¶ added in v1.11.0
type OfflineSpeechDenoiser struct {
// contains filtered or unexported fields
}
func NewOfflineSpeechDenoiser ¶ added in v1.11.0
func NewOfflineSpeechDenoiser(config *OfflineSpeechDenoiserConfig) *OfflineSpeechDenoiser
The user is responsible for invoking DeleteOfflineSpeechDenoiser() to free the returned denoiser and avoid a memory leak.
func (*OfflineSpeechDenoiser) Run ¶ added in v1.11.0
func (sd *OfflineSpeechDenoiser) Run(samples []float32, sampleRate int) *DenoisedAudio
func (*OfflineSpeechDenoiser) SampleRate ¶ added in v1.11.0
func (sd *OfflineSpeechDenoiser) SampleRate() int
type OfflineSpeechDenoiserConfig ¶ added in v1.11.0
type OfflineSpeechDenoiserConfig struct {
Model OfflineSpeechDenoiserModelConfig
}
type OfflineSpeechDenoiserGtcrnModelConfig ¶ added in v1.11.0
type OfflineSpeechDenoiserGtcrnModelConfig struct {
Model string
}
type OfflineSpeechDenoiserModelConfig ¶ added in v1.11.0
type OfflineSpeechDenoiserModelConfig struct {
Gtcrn OfflineSpeechDenoiserGtcrnModelConfig
NumThreads int32
Debug int32
Provider string
}
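A minimal sketch of speech denoising with a GTCRN model, using the same assumed import path as the overview sketch; the model path and output file name are placeholders, and the silent buffer stands in for real noisy audio.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Placeholder path; use a downloaded GTCRN speech-denoising model.
    config := sherpa.OfflineSpeechDenoiserConfig{}
    config.Model.Gtcrn.Model = "./gtcrn.onnx"
    config.Model.NumThreads = 1
    config.Model.Provider = "cpu"

    sd := sherpa.NewOfflineSpeechDenoiser(&config)
    defer sherpa.DeleteOfflineSpeechDenoiser(sd)

    // In practice, load real noisy mono samples in [-1, 1] here.
    samples := make([]float32, 16000)
    denoised := sd.Run(samples, 16000)

    ok := denoised.Save("./denoised.wav")
    fmt.Println("saved:", ok, "sample rate:", denoised.SampleRate)
}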
type OfflineStream ¶
type OfflineStream struct {
// contains filtered or unexported fields
}
It wraps a pointer from C
func NewAudioTaggingStream ¶ added in v1.10.44
func NewAudioTaggingStream(tagging *AudioTagging) *OfflineStream
The user is responsible for invoking DeleteOfflineStream() to free the returned stream and avoid a memory leak.
func NewOfflineStream ¶
func NewOfflineStream(recognizer *OfflineRecognizer) *OfflineStream
The user is responsible for invoking DeleteOfflineStream() to free the returned stream and avoid a memory leak.
func (*OfflineStream) AcceptWaveform ¶
func (s *OfflineStream) AcceptWaveform(sampleRate int, samples []float32)
Input audio samples for the offline stream. Please only call it once. That is, input all samples at once.
sampleRate is the sample rate of the input audio samples. If it is different from the value expected by the feature extractor, we will do resampling inside.
samples contains the actual audio samples. Each sample is in the range [-1, 1].
func (*OfflineStream) GetResult ¶
func (s *OfflineStream) GetResult() *OfflineRecognizerResult
Get the recognition result of the offline stream.
type OfflineTdnnModelConfig ¶ added in v1.7.8
type OfflineTdnnModelConfig struct {
Model string
}
type OfflineTransducerModelConfig ¶
type OfflineTransducerModelConfig struct {
Encoder string // Path to the encoder model, i.e., encoder.onnx or encoder.int8.onnx
Decoder string // Path to the decoder model
Joiner string // Path to the joiner model
}
Configuration for offline/non-streaming transducer.
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html to download pre-trained models
type OfflineTts ¶ added in v1.8.4
type OfflineTts struct {
// contains filtered or unexported fields
}
The offline tts class. It wraps a pointer from C.
func NewOfflineTts ¶ added in v1.8.4
func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts
The user is responsible for invoking DeleteOfflineTts() to free the returned tts and avoid a memory leak.
func (*OfflineTts) Generate ¶ added in v1.8.4
func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio
func (*OfflineTts) GenerateWithCallback ¶ added in v1.12.0
func (tts *OfflineTts) GenerateWithCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioCallbackWithArg)
func (*OfflineTts) GenerateWithProgressCallback ¶ added in v1.12.0
func (tts *OfflineTts) GenerateWithProgressCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioProgressCallbackWithArg)
func (*OfflineTts) GenerateWithZipvoice ¶ added in v1.12.15
func (tts *OfflineTts) GenerateWithZipvoice(text, promptText string, promptSamples []float32, promptSampleRate int, speed float32, numSteps int) *GeneratedAudio
type OfflineTtsConfig ¶ added in v1.8.4
type OfflineTtsConfig struct {
Model OfflineTtsModelConfig
RuleFsts string
RuleFars string
MaxNumSentences int
SilenceScale float32
}
type OfflineTtsKittenModelConfig ¶ added in v1.12.8
type OfflineTtsKittenModelConfig struct {
Model string // Path to the model for kitten
Voices string // Path to the voices.bin for kitten
Tokens string // Path to tokens.txt
DataDir string // Path to espeak-ng-data directory
LengthScale float32 // Use 1.0 in general; smaller values give faster speech, larger values give slower speech
}
type OfflineTtsKokoroModelConfig ¶ added in v1.10.41
type OfflineTtsKokoroModelConfig struct {
Model string // Path to the model for kokoro
Voices string // Path to the voices.bin for kokoro
Tokens string // Path to tokens.txt
DataDir string // Path to espeak-ng-data directory
DictDir string // unused
Lexicon string // Path to lexicon files
Lang string // Example: es for Spanish, fr-fr for French. Can be empty
LengthScale float32 // Use 1.0 in general; smaller values give faster speech, larger values give slower speech
}
type OfflineTtsMatchaModelConfig ¶ added in v1.10.38
type OfflineTtsMatchaModelConfig struct {
AcousticModel string // Path to the acoustic model for MatchaTTS
Vocoder string // Path to the vocoder model for MatchaTTS
Lexicon string // Path to lexicon.txt
Tokens string // Path to tokens.txt
DataDir string // Path to espeak-ng-data directory
NoiseScale float32 // Noise scale for MatchaTTS. Use 0.667 in general
LengthScale float32 // Use 1.0 in general; smaller values give faster speech, larger values give slower speech
DictDir string // unused
}
type OfflineTtsModelConfig ¶ added in v1.8.4
type OfflineTtsModelConfig struct {
Vits OfflineTtsVitsModelConfig
Matcha OfflineTtsMatchaModelConfig
Kokoro OfflineTtsKokoroModelConfig
Kitten OfflineTtsKittenModelConfig
Zipvoice OfflineTtsZipvoiceModelConfig
// Number of threads to use for neural network computation
NumThreads int
// 1 to print model meta information while loading
Debug int
// Optional. Valid values: cpu, cuda, coreml
Provider string
}
type OfflineTtsVitsModelConfig ¶ added in v1.8.4
type OfflineTtsVitsModelConfig struct {
Model string // Path to the VITS onnx model
Lexicon string // Path to lexicon.txt
Tokens string // Path to tokens.txt
DataDir string // Path to espeak-ng-data directory
NoiseScale float32 // noise scale for vits models. Please use 0.667 in general
NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general
LengthScale float32 // Use 1.0 in general; smaller values give faster speech, larger values give slower speech
DictDir string // unused
}
Configuration for offline/non-streaming text-to-speech (TTS).
Please refer to https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html to download pre-trained models
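A minimal sketch of text-to-speech with a VITS model, using the same assumed import path as the overview sketch; the model, token, espeak-ng-data, and output paths are placeholders for a downloaded VITS model.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Placeholder paths; use a downloaded VITS TTS model.
    config := sherpa.OfflineTtsConfig{}
    config.Model.Vits.Model = "./model.onnx"
    config.Model.Vits.Tokens = "./tokens.txt"
    config.Model.Vits.DataDir = "./espeak-ng-data"
    config.Model.Vits.NoiseScale = 0.667
    config.Model.Vits.NoiseScaleW = 0.8
    config.Model.Vits.LengthScale = 1.0
    config.Model.NumThreads = 2
    config.Model.Provider = "cpu"

    tts := sherpa.NewOfflineTts(&config)
    defer sherpa.DeleteOfflineTts(tts)

    // sid selects the speaker for multi-speaker models; speed 1.0 is normal.
    audio := tts.Generate("Hello from sherpa-onnx.", 0, 1.0)
    ok := audio.Save("./generated.wav")
    fmt.Println("saved:", ok, "sample rate:", audio.SampleRate)
}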
type OfflineTtsZipvoiceModelConfig ¶ added in v1.12.15
type OfflineTtsZipvoiceModelConfig struct {
Tokens string // Path to tokens.txt for ZipVoice
Encoder string // Path to text encoder (e.g. encoder.onnx)
Decoder string // Path to flow-matching decoder (e.g. fm_decoder.onnx)
DataDir string // Path to espeak-ng-data
Lexicon string // Path to lexicon.txt (needed for zh)
Vocoder string // Path to vocoder (e.g. vocos_24khz.onnx)
FeatScale float32 // Feature scale
TShift float32 // t-shift (<1 shifts to smaller t)
TargetRms float32 // Target RMS for speech normalization
GuidanceScale float32 // CFG scale
}
type OfflineWenetCtcModelConfig ¶ added in v1.12.12
type OfflineWenetCtcModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
type OfflineWhisperModelConfig ¶ added in v1.7.8
type OfflineZipformerAudioTaggingModelConfig ¶ added in v1.10.44
type OfflineZipformerAudioTaggingModelConfig struct {
Model string
}
Configuration for audio tagging.
type OfflineZipformerCtcModelConfig ¶ added in v1.12.4
type OfflineZipformerCtcModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
type OnlineCtcFstDecoderConfig ¶ added in v1.9.16
type OnlineModelConfig ¶ added in v1.7.6
type OnlineModelConfig struct {
Transducer OnlineTransducerModelConfig
Paraformer OnlineParaformerModelConfig
Zipformer2Ctc OnlineZipformer2CtcModelConfig
NemoCtc OnlineNemoCtcModelConfig
ToneCtc OnlineToneCtcModelConfig
Tokens string // Path to tokens.txt
NumThreads int // Number of threads to use for neural network computation
Provider string // Optional. Valid values are: cpu, cuda, coreml
Debug int // 1 to show model meta information while loading it.
ModelType string // Optional. You can specify it for faster model initialization
ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe
BpeVocab string // Optional.
TokensBuf string // Optional.
TokensBufSize int // Optional.
}
Configuration for online/streaming models
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html and https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html to download pre-trained models
type OnlineNemoCtcModelConfig ¶ added in v1.12.8
type OnlineNemoCtcModelConfig struct {
Model string // Path to the onnx model
}
type OnlineParaformerModelConfig ¶ added in v1.7.6
type OnlineParaformerModelConfig struct {
Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
Decoder string // Path to the decoder model.
}
Configuration for online/streaming paraformer models
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html to download pre-trained models
type OnlineRecognizer ¶
type OnlineRecognizer struct {
// contains filtered or unexported fields
}
The online recognizer class. It wraps a pointer from C.
func NewOnlineRecognizer ¶
func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer
The user is responsible for invoking DeleteOnlineRecognizer() to free the returned recognizer and avoid a memory leak.
func (*OnlineRecognizer) Decode ¶
func (recognizer *OnlineRecognizer) Decode(s *OnlineStream)
Decode the stream. Before calling this function, you have to ensure that recognizer.IsReady(s) returns true. Otherwise, you will be SAD.
You usually use it like below:
for recognizer.IsReady(s) {
    recognizer.Decode(s)
}
func (*OnlineRecognizer) DecodeStreams ¶
func (recognizer *OnlineRecognizer) DecodeStreams(s []*OnlineStream)
Decode multiple streams in parallel, i.e., in batch. You have to ensure that each stream is ready for decoding. Otherwise, you will be SAD.
func (*OnlineRecognizer) GetResult ¶
func (recognizer *OnlineRecognizer) GetResult(s *OnlineStream) *OnlineRecognizerResult
Gets the current result of the stream since the last invocation of Reset().
func (*OnlineRecognizer) IsEndpoint ¶
func (recognizer *OnlineRecognizer) IsEndpoint(s *OnlineStream) bool
Return true if an endpoint is detected.
You usually use it like below:
if recognizer.IsEndpoint(s) {
    // do your own stuff after detecting an endpoint
    recognizer.Reset(s)
}
func (*OnlineRecognizer) IsReady ¶
func (recognizer *OnlineRecognizer) IsReady(s *OnlineStream) bool
Checks whether the stream has enough feature frames for decoding. Returns true if the stream is ready for decoding and false otherwise.
You will usually use it like below:
for recognizer.IsReady(s) {
    recognizer.Decode(s)
}
func (*OnlineRecognizer) Reset ¶
func (recognizer *OnlineRecognizer) Reset(s *OnlineStream)
After calling this function, the internal neural network model states are reset, IsEndpoint(s) will return false, and GetResult(s) will return an empty string.
type OnlineRecognizerConfig ¶
type OnlineRecognizerConfig struct {
FeatConfig FeatureConfig
ModelConfig OnlineModelConfig
// Valid decoding methods: greedy_search, modified_beam_search
DecodingMethod string
// Used only when DecodingMethod is modified_beam_search. It specifies
// the maximum number of paths to keep during the search
MaxActivePaths int
EnableEndpoint int // 1 to enable endpoint detection.
// Please see
// https://k2-fsa.github.io/sherpa/ncnn/endpoint.html
// for the meaning of Rule1MinTrailingSilence, Rule2MinTrailingSilence
// and Rule3MinUtteranceLength.
Rule1MinTrailingSilence float32
Rule2MinTrailingSilence float32
Rule3MinUtteranceLength float32
HotwordsFile string
HotwordsScore float32
BlankPenalty float32
CtcFstDecoderConfig OnlineCtcFstDecoderConfig
RuleFsts string
RuleFars string
HotwordsBuf string
HotwordsBufSize int
Hr HomophoneReplacerConfig
}
Configuration for the online/streaming recognizer.
type OnlineRecognizerResult ¶
type OnlineRecognizerResult struct {
Text string
}
It contains the recognition result for an online stream.
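A minimal sketch of streaming recognition with endpoint detection, using the same assumed import path as the overview sketch. The model and token file names are placeholders for a downloaded streaming transducer model, and the silent chunks stand in for real microphone audio.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Placeholder file names; use a downloaded streaming transducer model.
    config := sherpa.OnlineRecognizerConfig{}
    config.FeatConfig.SampleRate = 16000
    config.FeatConfig.FeatureDim = 80
    config.ModelConfig.Transducer.Encoder = "./encoder.onnx"
    config.ModelConfig.Transducer.Decoder = "./decoder.onnx"
    config.ModelConfig.Transducer.Joiner = "./joiner.onnx"
    config.ModelConfig.Tokens = "./tokens.txt"
    config.ModelConfig.NumThreads = 1
    config.DecodingMethod = "greedy_search"
    config.EnableEndpoint = 1

    recognizer := sherpa.NewOnlineRecognizer(&config)
    defer sherpa.DeleteOnlineRecognizer(recognizer)

    stream := sherpa.NewOnlineStream(recognizer)
    defer sherpa.DeleteOnlineStream(stream)

    // In a real application the chunks come from a microphone or a file,
    // e.g. 100 ms (1600 samples at 16 kHz) at a time.
    chunk := make([]float32, 1600)
    for i := 0; i < 100; i++ {
        stream.AcceptWaveform(16000, chunk)
        for recognizer.IsReady(stream) {
            recognizer.Decode(stream)
        }
        if recognizer.IsEndpoint(stream) {
            fmt.Println("endpoint:", recognizer.GetResult(stream).Text)
            recognizer.Reset(stream)
        }
    }

    stream.InputFinished()
    for recognizer.IsReady(stream) {
        recognizer.Decode(stream)
    }
    fmt.Println("final:", recognizer.GetResult(stream).Text)
}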
type OnlineStream ¶
type OnlineStream struct {
// contains filtered or unexported fields
}
The online stream class. It wraps a pointer from C.
func NewKeywordStream ¶ added in v1.10.37
func NewKeywordStream(spotter *KeywordSpotter) *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func NewKeywordStreamWithKeywords ¶ added in v1.10.37
func NewKeywordStreamWithKeywords(spotter *KeywordSpotter, keywords string) *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func NewOnlineStream ¶
func NewOnlineStream(recognizer *OnlineRecognizer) *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func (*OnlineStream) AcceptWaveform ¶
func (s *OnlineStream) AcceptWaveform(sampleRate int, samples []float32)
Input audio samples for the stream.
sampleRate is the actual sample rate of the input audio samples. If it is different from the sample rate expected by the feature extractor, we will do resampling inside.
samples contains audio samples. Each sample is in the range [-1, 1].
func (*OnlineStream) InputFinished ¶
func (s *OnlineStream) InputFinished()
Signal that there will be no incoming audio samples. After calling this function, you cannot call OnlineStream.AcceptWaveform any longer.
The main purpose of this function is to flush the remaining audio samples buffered inside for feature extraction.
type OnlineToneCtcModelConfig ¶ added in v1.12.12
type OnlineToneCtcModelConfig struct {
Model string // Path to the onnx model
}
type OnlineTransducerModelConfig ¶
type OnlineTransducerModelConfig struct {
Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
Decoder string // Path to the decoder model.
Joiner string // Path to the joiner model.
}
Configuration for online/streaming transducer models
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html to download pre-trained models
type OnlineZipformer2CtcModelConfig ¶ added in v1.9.7
type OnlineZipformer2CtcModelConfig struct {
Model string // Path to the onnx model
}
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-ctc/index.html to download pre-trained models
type SileroVadModelConfig ¶ added in v1.9.15
type SileroVadModelConfig struct {
Model string
Threshold float32
MinSilenceDuration float32
MinSpeechDuration float32
WindowSize int
MaxSpeechDuration float32
}
For VAD.
type SpeakerEmbeddingExtractor ¶ added in v1.9.15
type SpeakerEmbeddingExtractor struct {
// contains filtered or unexported fields
}
func NewSpeakerEmbeddingExtractor ¶ added in v1.9.15
func NewSpeakerEmbeddingExtractor(config *SpeakerEmbeddingExtractorConfig) *SpeakerEmbeddingExtractor
The user has to invoke DeleteSpeakerEmbeddingExtractor() to free the returned value and avoid a memory leak.
func (*SpeakerEmbeddingExtractor) Compute ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) Compute(stream *OnlineStream) []float32
func (*SpeakerEmbeddingExtractor) CreateStream ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) CreateStream() *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func (*SpeakerEmbeddingExtractor) Dim ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) Dim() int
func (*SpeakerEmbeddingExtractor) IsReady ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) IsReady(stream *OnlineStream) bool
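A minimal sketch of extracting a speaker embedding, using the same assumed import path as the overview sketch. The fields of SpeakerEmbeddingExtractorConfig are not listed here, so the sketch leaves the config empty; fill in the embedding model path and related fields per the struct definition in your version. The silent buffer stands in for real speech of one speaker.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Fill in the embedding model path and other fields; see the
    // SpeakerEmbeddingExtractorConfig definition in your version.
    config := sherpa.SpeakerEmbeddingExtractorConfig{}

    ex := sherpa.NewSpeakerEmbeddingExtractor(&config)
    defer sherpa.DeleteSpeakerEmbeddingExtractor(ex)

    stream := ex.CreateStream()
    defer sherpa.DeleteOnlineStream(stream)

    // In practice, feed real 16 kHz mono samples of a single speaker.
    samples := make([]float32, 3*16000)
    stream.AcceptWaveform(16000, samples)
    stream.InputFinished()

    if ex.IsReady(stream) {
        embedding := ex.Compute(stream)
        fmt.Println("embedding dim:", len(embedding), "expected:", ex.Dim())
    }
}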
type SpeakerEmbeddingExtractorConfig ¶ added in v1.9.15
type SpeakerEmbeddingManager ¶ added in v1.9.15
type SpeakerEmbeddingManager struct {
// contains filtered or unexported fields
}
func NewSpeakerEmbeddingManager ¶ added in v1.9.15
func NewSpeakerEmbeddingManager(dim int) *SpeakerEmbeddingManager
The user has to invoke DeleteSpeakerEmbeddingManager() to free the returned value and avoid a memory leak.
func (*SpeakerEmbeddingManager) AllSpeakers ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) AllSpeakers() []string
func (*SpeakerEmbeddingManager) Contains ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) Contains(name string) bool
func (*SpeakerEmbeddingManager) NumSpeakers ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) NumSpeakers() int
func (*SpeakerEmbeddingManager) Register ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) Register(name string, embedding []float32) bool
func (*SpeakerEmbeddingManager) RegisterV ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) RegisterV(name string, embeddings [][]float32) bool
func (*SpeakerEmbeddingManager) Remove ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) Remove(name string) bool
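A minimal sketch of registering and looking up speakers with SpeakerEmbeddingManager, using the same assumed import path as the overview sketch. The 4-dimensional embedding is a toy value; in practice the dimension must match your embedding model (for example 512) and the vectors come from SpeakerEmbeddingExtractor.Compute.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Toy dimension; use the actual output dimension of your embedding model.
    dim := 4
    manager := sherpa.NewSpeakerEmbeddingManager(dim)
    defer sherpa.DeleteSpeakerEmbeddingManager(manager)

    // In practice these vectors come from SpeakerEmbeddingExtractor.Compute.
    alice := []float32{0.1, 0.2, 0.3, 0.4}
    manager.Register("alice", alice)

    fmt.Println("speakers:", manager.AllSpeakers(), "count:", manager.NumSpeakers())
    fmt.Println("contains alice:", manager.Contains("alice"))

    // Search returns the name of the best matching registered speaker within
    // the threshold; Verify checks a single named speaker.
    fmt.Println("search:", manager.Search(alice, 0.5))
    fmt.Println("verify alice:", manager.Verify("alice", alice, 0.5))

    manager.Remove("alice")
}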
type SpeechSegment ¶ added in v1.9.15
type SpokenLanguageIdentification ¶ added in v1.9.15
type SpokenLanguageIdentification struct {
// contains filtered or unexported fields
}
func NewSpokenLanguageIdentification ¶ added in v1.9.15
func NewSpokenLanguageIdentification(config *SpokenLanguageIdentificationConfig) *SpokenLanguageIdentification
func (*SpokenLanguageIdentification) Compute ¶ added in v1.9.15
func (slid *SpokenLanguageIdentification) Compute(stream *OfflineStream) *SpokenLanguageIdentificationResult
func (*SpokenLanguageIdentification) CreateStream ¶ added in v1.9.15
func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream
The user has to invoke DeleteOfflineStream() to free the returned value and avoid a memory leak.
type SpokenLanguageIdentificationConfig ¶ added in v1.9.15
type SpokenLanguageIdentificationConfig struct {
Whisper SpokenLanguageIdentificationWhisperConfig
NumThreads int
Debug int
Provider string
}
type SpokenLanguageIdentificationResult ¶ added in v1.9.15
type SpokenLanguageIdentificationResult struct {
Lang string
}
type SpokenLanguageIdentificationWhisperConfig ¶ added in v1.9.15
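A minimal sketch of spoken language identification, using the same assumed import path as the overview sketch. The fields of SpokenLanguageIdentificationWhisperConfig are not listed here, so the sketch leaves config.Whisper empty; fill in the multilingual Whisper encoder/decoder paths per the struct definition in your version. The silent buffer stands in for real audio.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    config := sherpa.SpokenLanguageIdentificationConfig{}
    // Fill in config.Whisper with a multilingual Whisper encoder/decoder; see
    // the SpokenLanguageIdentificationWhisperConfig definition in your version.
    config.NumThreads = 1
    config.Provider = "cpu"

    slid := sherpa.NewSpokenLanguageIdentification(&config)
    defer sherpa.DeleteSpokenLanguageIdentification(slid)

    stream := slid.CreateStream()
    defer sherpa.DeleteOfflineStream(stream)

    // In practice, load real 16 kHz mono samples in [-1, 1].
    samples := make([]float32, 5*16000)
    stream.AcceptWaveform(16000, samples)

    result := slid.Compute(stream)
    fmt.Println("detected language:", result.Lang)
}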
type TenVadModelConfig ¶ added in v1.12.6
type VadModelConfig ¶ added in v1.9.15
type VadModelConfig struct {
SileroVad SileroVadModelConfig
TenVad TenVadModelConfig
SampleRate int
NumThreads int
Provider string
Debug int
}
type VoiceActivityDetector ¶ added in v1.9.15
type VoiceActivityDetector struct {
// contains filtered or unexported fields
}
func NewVoiceActivityDetector ¶ added in v1.9.15
func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector
func (*VoiceActivityDetector) AcceptWaveform ¶ added in v1.9.15
func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32)
func (*VoiceActivityDetector) Clear ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Clear()
func (*VoiceActivityDetector) Flush ¶ added in v1.10.13
func (vad *VoiceActivityDetector) Flush()
func (*VoiceActivityDetector) Front ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Front() *SpeechSegment
func (*VoiceActivityDetector) IsEmpty ¶ added in v1.9.15
func (vad *VoiceActivityDetector) IsEmpty() bool
func (*VoiceActivityDetector) IsSpeech ¶ added in v1.9.15
func (vad *VoiceActivityDetector) IsSpeech() bool
func (*VoiceActivityDetector) Pop ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Pop()
func (*VoiceActivityDetector) Reset ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Reset()
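A minimal sketch of voice activity detection with a silero-vad model, using the same assumed import path as the overview sketch; the model path is a placeholder and the silent chunks stand in for real audio.

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    config := sherpa.VadModelConfig{}
    config.SileroVad.Model = "./silero_vad.onnx" // placeholder path
    config.SileroVad.Threshold = 0.5
    config.SileroVad.MinSilenceDuration = 0.5
    config.SileroVad.MinSpeechDuration = 0.25
    config.SileroVad.WindowSize = 512
    config.SampleRate = 16000
    config.NumThreads = 1
    config.Provider = "cpu"

    vad := sherpa.NewVoiceActivityDetector(&config, 20) // 20-second internal buffer
    defer sherpa.DeleteVoiceActivityDetector(vad)

    detected := 0
    chunk := make([]float32, 512) // in practice, real audio in 512-sample chunks
    for i := 0; i < 100; i++ {
        vad.AcceptWaveform(chunk)
        for !vad.IsEmpty() {
            segment := vad.Front() // one detected speech segment; see SpeechSegment
            _ = segment
            detected++
            vad.Pop()
        }
    }
    vad.Flush() // flush whatever is still buffered at the end of input
    fmt.Println("detected speech segments:", detected)
}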