# fish-speech-start-local/fish_speech/text/spliter.py

import re
import string

from fish_speech.text.clean import clean_text


def utf_8_len(text: str):
    """Return how many bytes ``text`` occupies once encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return len(encoded)


def break_text(texts, length, splits: set):
    """Cut over-long texts right after every delimiter character.

    Args:
        texts: Iterable of strings to process.
        length: Size limit in UTF-8 bytes; texts within the limit pass
            through untouched.
        splits: Characters after which an over-long text is cut. The
            delimiter stays attached to the end of the piece before the cut.

    Yields:
        The pieces in their original order. A piece can still be longer than
        ``length`` when it contains no delimiter.
    """
    for text in texts:
        if utf_8_len(text) > length:
            piece_start = 0
            for index, char in enumerate(text):
                if char in splits:
                    # Include the delimiter itself in the emitted piece.
                    yield text[piece_start : index + 1]
                    piece_start = index + 1

            # Whatever follows the last delimiter (if anything).
            if piece_start < len(text):
                yield text[piece_start:]
        else:
            yield text


def break_text_by_length(texts, length):
    """Hard-split over-long texts into chunks of at most ``length`` UTF-8 bytes.

    Texts that already fit are yielded unchanged. Longer texts are cut between
    characters (never inside one); a chunk is closed *before* the character
    that would push it over the limit.

    Args:
        texts: Iterable of strings to process.
        length: Maximum chunk size, measured in UTF-8 encoded bytes.

    Yields:
        The pieces in their original order; concatenating them reproduces the
        input texts. A chunk is larger than ``length`` only when a single
        character is by itself wider than ``length``.
    """
    for text in texts:
        if utf_8_len(text) <= length:
            yield text
            continue

        chunk = []
        chunk_len = 0  # running UTF-8 size of ``chunk``; avoids re-encoding it
        for char in text:
            char_len = utf_8_len(char)

            # Flush first: appending and then testing would let a multi-byte
            # character overshoot the limit.
            if chunk and chunk_len + char_len > length:
                yield "".join(chunk)
                chunk, chunk_len = [], 0

            chunk.append(char)
            chunk_len += char_len

        if chunk:
            yield "".join(chunk)


def add_cleaned(curr, segments):
    """Append the stripped ``curr`` to ``segments`` if it carries real content.

    A candidate is dropped when, after stripping, it is empty or made up only
    of whitespace and ASCII punctuation (``string.punctuation``).

    Args:
        curr: Candidate segment text.
        segments: List that receives the stripped segment; mutated in place.
    """
    stripped = curr.strip()
    if not stripped:
        return

    has_content = any(
        not c.isspace() and c not in string.punctuation for c in stripped
    )
    if has_content:
        segments.append(stripped)


def protect_float(text):
    """Mask decimal numbers so their "." is not mistaken for a sentence end.

    Every ``digits.digits`` run is rewritten as ``<digits_f_digits>``; for
    example ``3.14`` becomes ``<3_f_14>``.
    """
    float_pattern = re.compile(r"(\d+)\.(\d+)")
    placeholder = r"<\1_f_\2>"
    return float_pattern.sub(placeholder, text)


def unprotect_float(text):
    """Restore decimal numbers from their ``<digits_f_digits>`` placeholders.

    For example ``<3_f_14>`` is turned back into ``3.14``.
    """
    placeholder_pattern = re.compile(r"<(\d+)_f_(\d+)>")
    restored_form = r"\1.\2"
    return placeholder_pattern.sub(restored_form, text)


def split_text(text, length):
    """Split ``text`` into segments sized against a UTF-8 byte budget.

    The text is first passed through ``clean_text``. It is then broken down in
    progressively finer steps, each applied only to pieces that are still
    longer than ``length`` bytes:

    1. after ".", "!", "?" (and their full-width forms), with decimal numbers
       masked so that their "." does not trigger a cut;
    2. after "," (and its full-width form);
    3. after " ";
    4. at arbitrary character positions.

    Finally, neighbouring pieces are greedily glued back together while the
    combined size stays within ``length``.

    Args:
        text: Raw input text.
        length: Size budget per segment, in UTF-8 bytes.

    Returns:
        List of stripped segments; pieces consisting only of whitespace and
        punctuation are dropped (see ``add_cleaned``).
    """
    cleaned = clean_text(text)

    sentence_marks = {".", "!", "?", "。", "！", "？"}
    clause_marks = {",", "，"}

    # Stage 1: sentence boundaries, with floats masked during the split.
    pieces = break_text((protect_float(cleaned),), length, sentence_marks)
    pieces = (unprotect_float(piece) for piece in pieces)
    # Stages 2-4: commas, then spaces, then a hard cut by size.
    pieces = break_text(pieces, length, clause_marks)
    pieces = break_text(pieces, length, {" "})
    pieces = break_text_by_length(pieces, length)

    # Greedy merge: keep appending pieces until the next one would not fit.
    segments = []
    pending = ""
    for piece in pieces:
        if utf_8_len(pending) + utf_8_len(piece) > length:
            add_cleaned(pending, segments)
            pending = piece
        else:
            pending += piece

    if pending:
        add_cleaned(pending, segments)

    return segments


if __name__ == "__main__":
    # Self-test for split_text. Each case is
    # (input text, byte limit, expected segments), checked in order.
    cases = [
        (
            "This is a test sentence. This is another test sentence. And a third one.",
            50,
            [
                "This is a test sentence.",
                "This is another test sentence. And a third one.",
            ],
        ),
        ("a,aaaaaa3.14", 10, ["a,", "aaaaaa3.14"]),
        ("   ", 10, []),
        ("a", 10, ["a"]),
        (
            "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines.",
            50,
            [
                "This is a test sentence with only commas,",
                "and no dots, and no exclamation marks,",
                "and no question marks, and no newlines.",
            ],
        ),
        (
            # First half split at " ", second half split at ","
            "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence.",
            50,
            [
                "This is a test sentence This is a test sentence",
                "This is a test sentence. This is a test sentence,",
                "This is a test sentence, This is a test sentence.",
            ],
        ),
        (
            "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。",
            50,
            [
                "这是一段很长的中文文本,",
                "而且没有句号,也没有感叹号,",
                "也没有问号,也没有换行符.",
            ],
        ),
    ]

    for source, limit, expected in cases:
        assert split_text(source, limit) == expected