TTS/KokoroTtsSanitizer/KokoroTtsSanitizer.cs (C#)

Documentation

KokoroTtsSanitizer

Prepare assistant text for speech synthesis.

KokoroTtsSanitizer.cs

  • Strips emojis, markdown, URLs, and noisy symbols before TTS runs.
using System.Text;
using System.Text.RegularExpressions;

namespace Agent.Core.TextToSpeech;

/// <summary>
/// Prepares chat/assistant text for Kokoro TTS by removing symbols, emojis, and markup
/// that should not be read aloud.
/// </summary>
public static partial class KokoroTtsSanitizer
{
    // Surrogate pairs + misc symbols; avoids \p{Extended_Pictographic} (not in all regex engines).
    private static readonly Regex EmojiRegex = new(
        @"[\u2600-\u27BF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|\uFE0F|\u200D",
        RegexOptions.Compiled);

    // Cached references to the source-generated patterns declared at the bottom of the class.
    private static readonly Regex UrlRegex = UrlPattern();
    private static readonly Regex MarkdownLinkRegex = MarkdownLinkPattern();
    private static readonly Regex MarkdownImageRegex = MarkdownImagePattern();
    private static readonly Regex FenceRegex = FencePattern();
    private static readonly Regex InlineCodeRegex = InlineCodePattern();
    private static readonly Regex HtmlTagRegex = HtmlTagPattern();
    private static readonly Regex HeaderPrefixRegex = HeaderPrefixPattern();
    private static readonly Regex BulletPrefixRegex = BulletPrefixPattern();
    private static readonly Regex BlockquotePrefixRegex = BlockquotePrefixPattern();
    private static readonly Regex HorizontalRuleRegex = HorizontalRulePattern();
    private static readonly Regex MentionRegex = MentionPattern();
    private static readonly Regex HashtagWordRegex = HashtagWordPattern();
    private static readonly Regex MarkdownEmphasisRegex = MarkdownEmphasisPattern();
    private static readonly Regex ControlCharsRegex = ControlCharsPattern();
    private static readonly Regex OddSymbolsRegex = OddSymbolsPattern();
    private static readonly Regex WhitespaceRegex = WhitespacePattern();

    /// <summary>Returns speech-safe plain text, or empty if nothing remains.</summary>
    /// <param name="text">Raw chat/assistant text; may be <see langword="null"/>, empty, or whitespace.</param>
    /// <returns>
    /// The input with code, markdown, URLs, HTML tags, mentions, emojis, control characters and
    /// noisy symbols removed or unwrapped, whitespace collapsed to single spaces and trimmed.
    /// Returns <see cref="string.Empty"/> when <paramref name="text"/> is null or whitespace.
    /// </returns>
    public static string Sanitize(string? text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return string.Empty;
        }

        // string.Normalize throws ArgumentException on ill-formed UTF-16 (e.g. an emoji cut in
        // half), so unpaired surrogates are blanked out before normalizing.
        string result = ReplaceLoneSurrogates(text).Normalize(NormalizationForm.FormKC);

        // Code goes first so its contents are not touched by the markdown rules below.
        result = FenceRegex.Replace(result, " ");
        result = InlineCodeRegex.Replace(result, " ");

        // Images before links: the image syntax is the link syntax with a leading '!'.
        result = MarkdownImageRegex.Replace(result, " ");
        result = MarkdownLinkRegex.Replace(result, "$1"); // keep the link label only
        result = UrlRegex.Replace(result, " ");
        result = HtmlTagRegex.Replace(result, " ");

        // Line-anchored markdown prefixes.
        result = HeaderPrefixRegex.Replace(result, " ");
        result = BlockquotePrefixRegex.Replace(result, " ");
        result = BulletPrefixRegex.Replace(result, " ");
        result = HorizontalRuleRegex.Replace(result, " ");

        result = MentionRegex.Replace(result, " ");
        result = HashtagWordRegex.Replace(result, "$1"); // "#topic" -> "topic"

        // Each alternative of the emphasis pattern has its own capture group and only the one
        // that matched is non-empty, so all five are emitted to keep the emphasized text.
        result = MarkdownEmphasisRegex.Replace(result, "${1}${2}${3}${4}${5}");

        result = EmojiRegex.Replace(result, " ");
        result = ControlCharsRegex.Replace(result, " ");
        result = OddSymbolsRegex.Replace(result, " ");

        // Typographic quotes become ASCII quotes; em/en dashes become spaces.
        result = result.Replace('\u2018', '\'').Replace('\u2019', '\'');
        result = result.Replace('\u201C', '"').Replace('\u201D', '"');
        result = result.Replace('\u2014', ' ').Replace('\u2013', ' ');

        // Common HTML entities.
        result = result.Replace("&nbsp;", " ", StringComparison.OrdinalIgnoreCase);
        result = result.Replace("&amp;", " and ", StringComparison.OrdinalIgnoreCase);
        result = result.Replace("&lt;", " ").Replace("&gt;", " ");

        return WhitespaceRegex.Replace(result, " ").Trim();
    }

    /// <summary>
    /// Replaces every surrogate code unit that is not part of a well-formed high/low pair with a space.
    /// </summary>
    /// <param name="value">Text to scan.</param>
    /// <returns>
    /// <paramref name="value"/> itself when it contains no unpaired surrogate; otherwise a copy
    /// of the same length with each unpaired surrogate replaced by <c>' '</c>.
    /// </returns>
    private static string ReplaceLoneSurrogates(string value)
    {
        StringBuilder? patched = null;

        for (int i = 0; i < value.Length; i++)
        {
            char current = value[i];

            if (char.IsHighSurrogate(current)
                && i + 1 < value.Length
                && char.IsLowSurrogate(value[i + 1]))
            {
                i++; // well-formed pair: skip its low half as well
                continue;
            }

            if (char.IsSurrogate(current))
            {
                // One-for-one replacement keeps indices aligned with the original string.
                patched ??= new StringBuilder(value);
                patched[i] = ' ';
            }
        }

        return patched is null ? value : patched.ToString();
    }

    // RegexOptions.Compiled is omitted below: the regex source generator ignores it.

    /// <summary>Fenced code block delimited by triple backticks, possibly spanning lines.</summary>
    [GeneratedRegex(@"```[\s\S]*?```")]
    private static partial Regex FencePattern();

    /// <summary>Single-line inline code span delimited by backticks.</summary>
    [GeneratedRegex(@"`[^`\r\n]+`")]
    private static partial Regex InlineCodePattern();

    /// <summary>Markdown image: <c>![alt](target)</c>.</summary>
    [GeneratedRegex(@"!\[[^\]]*\]\([^)]*\)")]
    private static partial Regex MarkdownImagePattern();

    /// <summary>Markdown link: <c>[label](target)</c>; group 1 is the label.</summary>
    [GeneratedRegex(@"\[([^\]]+)\]\([^)]*\)")]
    private static partial Regex MarkdownLinkPattern();

    /// <summary>Bare http/https URL (case-insensitive scheme).</summary>
    [GeneratedRegex(@"https?://[^\s<>()\[\]]+", RegexOptions.IgnoreCase)]
    private static partial Regex UrlPattern();

    /// <summary>Anything between <c>&lt;</c> and the next <c>&gt;</c>.</summary>
    [GeneratedRegex(@"<[^>]+>")]
    private static partial Regex HtmlTagPattern();

    /// <summary>One to six <c>#</c> characters followed by whitespace at the start of a line.</summary>
    [GeneratedRegex(@"^#{1,6}\s+", RegexOptions.Multiline)]
    private static partial Regex HeaderPrefixPattern();

    /// <summary><c>&gt;</c> at the start of a line, with an optional following whitespace character.</summary>
    [GeneratedRegex(@"^>\s?", RegexOptions.Multiline)]
    private static partial Regex BlockquotePrefixPattern();

    /// <summary>List bullet (<c>-</c>, <c>*</c> or <c>+</c>) plus whitespace at the start of a line.</summary>
    [GeneratedRegex(@"^[\s]*[-*+]\s+", RegexOptions.Multiline)]
    private static partial Regex BulletPrefixPattern();

    /// <summary>Line consisting only of three or more <c>-</c>, <c>*</c> or <c>_</c> characters.</summary>
    [GeneratedRegex(@"^[\s]*[-*_]{3,}[\s]*$", RegexOptions.Multiline)]
    private static partial Regex HorizontalRulePattern();

    /// <summary><c>@</c> followed by word characters or dots.</summary>
    [GeneratedRegex(@"@[\w.]+")]
    private static partial Regex MentionPattern();

    /// <summary><c>#</c> followed by word characters; group 1 is the word.</summary>
    [GeneratedRegex(@"#([\w]+)")]
    private static partial Regex HashtagWordPattern();

    /// <summary>
    /// Markdown emphasis. Groups 1-5 hold the inner text of <c>**x**</c>, <c>*x*</c>,
    /// <c>__x__</c>, <c>_x_</c> and <c>~~x~~</c> respectively; only one is set per match.
    /// </summary>
    [GeneratedRegex(@"\*\*([^*]+)\*\*|\*([^*]+)\*|__([^_]+)__|_([^_]+)_|~~([^~]+)~~")]
    private static partial Regex MarkdownEmphasisPattern();

    /// <summary>C0 control characters other than tab, line feed and carriage return, plus DEL.</summary>
    [GeneratedRegex(@"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]")]
    private static partial Regex ControlCharsPattern();

    /// <summary>Symbols that TTS often reads literally; keep natural punctuation.</summary>
    [GeneratedRegex(@"[{}\[\]|\\<>#^=+~`|]")]
    private static partial Regex OddSymbolsPattern();

    /// <summary>Run of one or more whitespace characters.</summary>
    [GeneratedRegex(@"\s+")]
    private static partial Regex WhitespacePattern();
}