TTS/KokoroTts/KokoroTts.cs (C#)

Documentation

KokoroTts

Local Kokoro text-to-speech engine.

KokoroTts.cs

  • KokoroTts — loads the Kokoro ONNX model and speaks sanitized text (default voice af_heart).
using System;
using Agent.Core.Logging;
using KokoroSharp;
using KokoroSharp.Core;

namespace Agent.Core.TextToSpeech;

/// <summary>
/// Local text-to-speech via <a href="https://github.com/Lyrcaxis/KokoroSharp/">KokoroSharp</a> (ONNX Kokoro 82M).
/// </summary>
/// <remarks>
/// The model and the voice are loaded lazily by <see cref="InitializeAsync"/>; reads and writes of
/// both are serialized through a private lock object.
/// </remarks>
public sealed class KokoroTts
{
    private const string LogTag = "TTS";
    private const string DefaultVoiceId = "af_heart";

    /// <summary>Environment variable that, when set to "1", replaces real playback with a timed simulation.</summary>
    private const string SimulationEnvVar = "LILITH_TESTER";

    private const int SimulationSteps = 10;
    private const int ProgressPollMs = 100;

    private static readonly string[] Spinner = { "|", "/", "-", "\\" };

    private readonly Logger _logger;
    private readonly object _gate = new();
    private KokoroTTS? _model;
    private KokoroVoice? _voice;

    /// <summary>Identifier of the voice currently selected for synthesis.</summary>
    public string VoiceId { get; private set; }

    /// <summary>When true, <see cref="SpeakAsync"/> waits (with progress output) until speech ends.</summary>
    public bool WaitForTts { get; }

    /// <summary>True once both the model and the voice have been loaded.</summary>
    public bool IsReady
    {
        get
        {
            lock (_gate)
            {
                return _model is not null && _voice is not null;
            }
        }
    }

    /// <summary>Creates an uninitialized engine; call <see cref="InitializeAsync"/> or use <see cref="CreateAsync"/>.</summary>
    /// <param name="logger">Destination for status and progress messages.</param>
    /// <param name="options">Optional settings; the voice defaults to "af_heart" and waiting defaults to off.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public KokoroTts(Logger logger, KokoroTtsOptions? options = null)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        VoiceId = options?.VoiceId ?? DefaultVoiceId;
        WaitForTts = options?.WaitForTts ?? false;
    }

    /// <summary>Load model and voice (may download ~320MB on first run).</summary>
    /// <param name="logger">Destination for status and progress messages.</param>
    /// <param name="options">Optional settings forwarded to the constructor.</param>
    /// <param name="cancellationToken">Token forwarded to <see cref="InitializeAsync"/>.</param>
    /// <returns>An initialized engine.</returns>
    public static async Task<KokoroTts> CreateAsync(
        Logger logger,
        KokoroTtsOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        var service = new KokoroTts(logger, options);
        await service.InitializeAsync(cancellationToken).ConfigureAwait(false);
        return service;
    }

    /// <summary>Loads the model and the configured voice on a background task if they are not loaded yet.</summary>
    /// <param name="cancellationToken">Token passed to <c>Task.Run</c> for the background load.</param>
    /// <exception cref="InvalidOperationException">The configured voice could not be found.</exception>
    public async Task InitializeAsync(CancellationToken cancellationToken = default)
    {
        if (IsReady)
        {
            return;
        }

        await Task.Run(
            () =>
            {
                lock (_gate)
                {
                    // Check BOTH fields: a previous attempt may have loaded the model and then
                    // failed on the voice lookup. In that case retry the voice only.
                    if (_model is not null && _voice is not null)
                    {
                        return;
                    }

                    if (_model is null)
                    {
                        _logger.WriteLine(
                            LogTag,
                            "Loading Kokoro model (first run may download ~320MB)...",
                            LogColors.Cyan);

                        _model = KokoroTTS.LoadModel();
                    }

                    _voice = KokoroVoiceManager.GetVoice(VoiceId)
                        ?? throw new InvalidOperationException($"Kokoro voice '{VoiceId}' was not found.");

                    _logger.WriteLine(LogTag, $"Kokoro ready (voice: {VoiceId}).", LogColors.Cyan);
                }
            },
            cancellationToken).ConfigureAwait(false);
    }

    /// <summary>Switches the voice used for subsequent speech.</summary>
    /// <param name="voiceId">Identifier passed to <c>KokoroVoiceManager.GetVoice</c>.</param>
    /// <exception cref="InvalidOperationException">No voice was returned for <paramref name="voiceId"/>.</exception>
    public void SetVoice(string voiceId)
    {
        lock (_gate)
        {
            // Resolve first so a failed lookup leaves the current voice untouched.
            _voice = KokoroVoiceManager.GetVoice(voiceId)
                ?? throw new InvalidOperationException($"Kokoro voice '{voiceId}' was not found.");
            VoiceId = voiceId;
        }
    }

    /// <summary>Speak text using KokoroSharp playback queue.</summary>
    /// <remarks>
    /// Blank input, or input that sanitizes to an empty string, is ignored. If playback cannot be
    /// started, or the LILITH_TESTER environment variable is "1", a timed simulation is used instead.
    /// When <see cref="WaitForTts"/> is false the method returns right after starting playback.
    /// Cancellation and timeout are logged; they are not thrown from the wait loop.
    /// </remarks>
    /// <param name="text">Raw text; it is sanitized before being spoken.</param>
    /// <param name="cancellationToken">Interrupts initialization and the wait for completion.</param>
    public async Task SpeakAsync(string text, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return;
        }

        if (!IsReady)
        {
            await InitializeAsync(cancellationToken).ConfigureAwait(false);
        }

        string spoken = KokoroTtsSanitizer.Sanitize(text);
        if (spoken.Length == 0)
        {
            return;
        }

        SynthesisHandle? handle = null;
        bool useSimulation = Environment.GetEnvironmentVariable(SimulationEnvVar) == "1";
        if (!useSimulation)
        {
            try
            {
                handle = StartPlayback(spoken);
            }
            catch (Exception ex)
            {
                // Deliberate best-effort: any playback failure degrades to the simulation.
                _logger.WriteLine(LogTag, $"Playback initialization failed: {ex.Message}. Falling back to simulation.", LogColors.Orange);
                useSimulation = true;
            }
        }

        if (!WaitForTts)
        {
            return;
        }

        // Run continuations asynchronously so the rest of this method never executes inline on the
        // thread that raises the KokoroSharp completion callback.
        var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
        int pct = 0;

        using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);

        // Copy the token once: the simulation task may outlive this method, and reading
        // cts.Token after the source is disposed throws ObjectDisposedException.
        CancellationToken linkedToken = cts.Token;

        if (useSimulation)
        {
            // Simulate speaking at ~50 characters per second (min 1s, max 3s total to keep tests fast).
            int totalDurationMs = (int)Math.Clamp((long)spoken.Length * 20, 1000, 3000);
            _ = Task.Run(async () =>
            {
                int stepDurationMs = totalDurationMs / SimulationSteps;
                for (int step = 1; step <= SimulationSteps; step++)
                {
                    if (linkedToken.IsCancellationRequested)
                    {
                        completion.TrySetResult(false);
                        return;
                    }

                    Volatile.Write(ref pct, step * (100 / SimulationSteps));
                    await Task.Delay(stepDurationMs).ConfigureAwait(false);
                }

                completion.TrySetResult(true);
            });
        }
        else if (handle != null)
        {
            // NOTE(review): handlers are attached after SpeakFast has already been called;
            // presumably speech cannot finish before this point — confirm against KokoroSharp.
            handle.OnSpeechProgressed += p =>
            {
                int estimate = (int)Math.Round((double)p.SpokenText_BestGuess.Length / spoken.Length * 100);
                Volatile.Write(ref pct, Math.Clamp(estimate, 0, 100));
            };
            handle.OnSpeechCompleted += _ => completion.TrySetResult(true);
            handle.OnSpeechCanceled += _ => completion.TrySetResult(false);
        }

        cts.CancelAfter(ComputeTimeoutMs(spoken.Length));

        // Task.WhenAny never throws, so the loop ends only via its own condition.
        int spinIdx = 0;
        while (!completion.Task.IsCompleted && !linkedToken.IsCancellationRequested)
        {
            string spinChar = Spinner[spinIdx % Spinner.Length];
            spinIdx++;

            int shownPct = Volatile.Read(ref pct);
            _logger.WriteProgress(LogTag, $"Speaking... [{spinChar}] {shownPct}%");

            await Task.WhenAny(completion.Task, Task.Delay(ProgressPollMs, linkedToken)).ConfigureAwait(false);
        }

        if (cancellationToken.IsCancellationRequested)
        {
            _logger.WriteLine(LogTag, "[Cancelled] Speech was interrupted.", LogColors.Orange);
        }
        else if (completion.Task.IsCompletedSuccessfully && completion.Task.Result)
        {
            _logger.WriteProgress(LogTag, "Completed [100%]");
            _logger.WriteLine(LogTag, "", LogColors.Green);
        }
        else if (cts.IsCancellationRequested && !completion.Task.IsCompleted)
        {
            _logger.WriteLine(LogTag, "[Timeout] Speech completion timed out (possibly no audio device).", LogColors.Orange);
        }
        else
        {
            _logger.WriteLine(LogTag, "[Cancelled] Speech was interrupted.", LogColors.Orange);
        }
    }

    /// <summary>Starts playback of already-sanitized text under the lock and returns the synthesis handle.</summary>
    /// <exception cref="InvalidOperationException">The model or voice has not been loaded.</exception>
    private SynthesisHandle StartPlayback(string spoken)
    {
        lock (_gate)
        {
            if (_model is null || _voice is null)
            {
                throw new InvalidOperationException("Kokoro TTS is not initialized.");
            }

            return _model.SpeakFast(spoken, _voice);
        }
    }

    /// <summary>
    /// Wait budget in milliseconds: (5 s + 1 s per 15 characters) times a safety factor of 3,
    /// which gives a floor of 15 s. The result is capped at <see cref="int.MaxValue"/>.
    /// </summary>
    private static int ComputeTimeoutMs(int spokenLength)
    {
        double budgetMs = ((spokenLength / 15.0) + 5.0) * 3.0 * 1000.0;
        return (int)Math.Min(int.MaxValue, Math.Max(5000, budgetMs));
    }
}