using System.Text;
using System.Text.RegularExpressions;

namespace Agent.Core.TextToSpeech;

/// <summary>
/// Prepares chat/assistant text for Kokoro TTS by removing symbols, emojis, and markup
/// that should not be read aloud.
/// </summary>
public static partial class KokoroTtsSanitizer
{
    // Surrogate pairs + misc symbols; avoids \p{Extended_Pictographic} (not in all regex engines).
    // Also matches the variation selector U+FE0F and the zero-width joiner U+200D.
    private static readonly Regex EmojiRegex = new(
        @"[\u2600-\u27BF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|\uFE0F|\u200D",
        RegexOptions.Compiled);

    private static readonly Regex UrlRegex = UrlPattern();
    private static readonly Regex MarkdownLinkRegex = MarkdownLinkPattern();
    private static readonly Regex MarkdownImageRegex = MarkdownImagePattern();
    private static readonly Regex FenceRegex = FencePattern();
    private static readonly Regex InlineCodeRegex = InlineCodePattern();
    private static readonly Regex HtmlTagRegex = HtmlTagPattern();
    private static readonly Regex HeaderPrefixRegex = HeaderPrefixPattern();
    private static readonly Regex BulletPrefixRegex = BulletPrefixPattern();
    private static readonly Regex BlockquotePrefixRegex = BlockquotePrefixPattern();
    private static readonly Regex HorizontalRuleRegex = HorizontalRulePattern();
    private static readonly Regex MentionRegex = MentionPattern();
    private static readonly Regex HashtagWordRegex = HashtagWordPattern();
    private static readonly Regex MarkdownEmphasisRegex = MarkdownEmphasisPattern();
    private static readonly Regex ControlCharsRegex = ControlCharsPattern();
    private static readonly Regex OddSymbolsRegex = OddSymbolsPattern();
    private static readonly Regex WhitespaceRegex = WhitespacePattern();

    /// <summary>Returns speech-safe plain text, or empty if nothing remains.</summary>
    /// <param name="text">Raw chat/assistant text; may be <see langword="null"/>.</param>
    /// <returns>
    /// The input with markup, URLs, mentions, emojis and noisy symbols removed, whitespace
    /// collapsed to single spaces and trimmed. <see cref="string.Empty"/> when the input is
    /// null, empty or whitespace-only.
    /// </returns>
    public static string Sanitize(string? text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return string.Empty;
        }

        // Compatibility normalization folds full-width forms, ligatures and similar
        // variants into their plain equivalents before any pattern runs.
        string result = text.Normalize(NormalizationForm.FormKC);

        // --- Block and inline markup -------------------------------------------------
        // Fences go first: they contain backticks the inline-code pattern would otherwise
        // partially consume.
        result = FenceRegex.Replace(result, " ");
        result = InlineCodeRegex.Replace(result, " ");

        // Images before links: an image is a link prefixed with '!', and is dropped
        // entirely, while a link keeps its label text.
        result = MarkdownImageRegex.Replace(result, " ");
        result = MarkdownLinkRegex.Replace(result, "$1");
        result = UrlRegex.Replace(result, " ");
        result = HtmlTagRegex.Replace(result, " ");

        // --- Line-level markdown prefixes --------------------------------------------
        result = HeaderPrefixRegex.Replace(result, " ");
        result = BlockquotePrefixRegex.Replace(result, " ");
        result = BulletPrefixRegex.Replace(result, " ");
        result = HorizontalRuleRegex.Replace(result, " ");

        // --- Social markup and emphasis ----------------------------------------------
        result = MentionRegex.Replace(result, " ");
        result = HashtagWordRegex.Replace(result, "$1");

        // Each alternative of the emphasis pattern captures into its own group (1..5).
        // Exactly one group participates per match and the others substitute as empty,
        // so concatenating all five keeps the inner text whichever marker was used.
        result = MarkdownEmphasisRegex.Replace(result, "$1$2$3$4$5");

        // --- Symbols -----------------------------------------------------------------
        result = EmojiRegex.Replace(result, " ");
        result = ControlCharsRegex.Replace(result, " ");
        result = OddSymbolsRegex.Replace(result, " ");

        // Typographic quotes become ASCII quotes; dashes become pauses (spaces).
        result = result.Replace('\u2018', '\'').Replace('\u2019', '\'');
        result = result.Replace('\u201C', '"').Replace('\u201D', '"');
        result = result.Replace('\u2014', ' ').Replace('\u2013', ' ');

        // --- HTML entities -----------------------------------------------------------
        // Named entities must be handled before the bare '&' rule below, otherwise
        // "&amp;" would be spoken as "and amp;".
        result = result.Replace("&nbsp;", " ", StringComparison.OrdinalIgnoreCase);
        result = result.Replace("&lt;", " ", StringComparison.OrdinalIgnoreCase);
        result = result.Replace("&gt;", " ", StringComparison.OrdinalIgnoreCase);
        result = result.Replace("&amp;", " and ", StringComparison.OrdinalIgnoreCase);
        result = result.Replace("&", " and ", StringComparison.Ordinal);

        // Every removal above leaves a space behind; collapse the runs and trim.
        return WhitespaceRegex.Replace(result, " ").Trim();
    }

    // NOTE: RegexOptions.Compiled is intentionally not passed to [GeneratedRegex];
    // the regex source generator ignores that flag.

    /// <summary>Fenced code blocks delimited by triple backticks, possibly multi-line.</summary>
    [GeneratedRegex(@"```[\s\S]*?```")]
    private static partial Regex FencePattern();

    /// <summary>Single-line inline code spans delimited by single backticks.</summary>
    [GeneratedRegex(@"`[^`\r\n]+`")]
    private static partial Regex InlineCodePattern();

    /// <summary>Markdown images: <c>![alt](target)</c>.</summary>
    [GeneratedRegex(@"!\[[^\]]*\]\([^)]*\)")]
    private static partial Regex MarkdownImagePattern();

    /// <summary>Markdown links: <c>[label](target)</c>; group 1 is the label.</summary>
    [GeneratedRegex(@"\[([^\]]+)\]\([^)]*\)")]
    private static partial Regex MarkdownLinkPattern();

    /// <summary>Bare http/https URLs.</summary>
    [GeneratedRegex(@"https?://[^\s<>()\[\]]+", RegexOptions.IgnoreCase)]
    private static partial Regex UrlPattern();

    /// <summary>Anything between a '&lt;' and the next '&gt;'.</summary>
    [GeneratedRegex(@"<[^>]+>")]
    private static partial Regex HtmlTagPattern();

    /// <summary>Markdown header markers (one to six '#') at the start of a line.</summary>
    [GeneratedRegex(@"^#{1,6}\s+", RegexOptions.Multiline)]
    private static partial Regex HeaderPrefixPattern();

    /// <summary>Blockquote marker at the start of a line.</summary>
    [GeneratedRegex(@"^>\s?", RegexOptions.Multiline)]
    private static partial Regex BlockquotePrefixPattern();

    /// <summary>Bullet list markers ('-', '*', '+') at the start of a line.</summary>
    [GeneratedRegex(@"^[\s]*[-*+]\s+", RegexOptions.Multiline)]
    private static partial Regex BulletPrefixPattern();

    /// <summary>Lines consisting only of three or more '-', '*' or '_' characters.</summary>
    [GeneratedRegex(@"^[\s]*[-*_]{3,}[\s]*$", RegexOptions.Multiline)]
    private static partial Regex HorizontalRulePattern();

    /// <summary>@-mentions: '@' followed by word characters or dots.</summary>
    [GeneratedRegex(@"@[\w.]+")]
    private static partial Regex MentionPattern();

    /// <summary>Hashtags; group 1 is the word without the leading '#'.</summary>
    [GeneratedRegex(@"#([\w]+)")]
    private static partial Regex HashtagWordPattern();

    /// <summary>
    /// Markdown emphasis. The inner text is captured in group 1 (<c>**x**</c>),
    /// 2 (<c>*x*</c>), 3 (<c>__x__</c>), 4 (<c>_x_</c>) or 5 (<c>~~x~~</c>).
    /// </summary>
    [GeneratedRegex(@"\*\*([^*]+)\*\*|\*([^*]+)\*|__([^_]+)__|_([^_]+)_|~~([^~]+)~~")]
    private static partial Regex MarkdownEmphasisPattern();

    /// <summary>C0 control characters except tab, line feed and carriage return, plus DEL.</summary>
    [GeneratedRegex(@"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]")]
    private static partial Regex ControlCharsPattern();

    /// <summary>Symbols that TTS often reads literally; keep natural punctuation.</summary>
    [GeneratedRegex(@"[{}\[\]|\\<>#^=+~`|]")]
    private static partial Regex OddSymbolsPattern();

    /// <summary>Runs of whitespace, collapsed to a single space by the caller.</summary>
    [GeneratedRegex(@"\s+")]
    private static partial Regex WhitespacePattern();
}
Documentation
KokoroTtsSanitizer
Prepare assistant text for speech synthesis.
KokoroTtsSanitizer.cs
- Strips emojis, markdown, URLs, and noisy symbols before TTS runs.