Source code for lingua_loop.services.text_normalization

"""Text normalization utilities for scoring comparisons."""

from abc import ABC
from abc import abstractmethod
from re import sub
from typing import Literal
from unicodedata import combining
from unicodedata import normalize

from lingua_loop.integrations.youtube.types import SupportedLanguageCodes

NormalizationForm = Literal["NFC", "NFKC", "NFD", "NFKD"]


class TextNormalizer(ABC):
    """Normalize text so two strings can be compared for scoring.

    The normalization is intentionally lossy. It discards distinctions
    such as:

    - (language-specific) accent variations
    - case differences
    - punctuation

    As a result, texts like "Er hat gesagt" and "er hat gesagt" are
    considered equivalent after normalization.
    """

    # Unicode normalization form available to subclasses that decompose
    # characters before filtering them.
    form: NormalizationForm = "NFKD"

    def normalize(self, text: str) -> str:
        """Run ``text`` through every normalization step, in order.

        The steps are: special characters, case, punctuation, whitespace.
        Whitespace runs last so the spaces left behind by punctuation
        removal are collapsed as well.
        """
        text = self.normalize_special_characters(text)
        text = self.normalize_case(text)
        text = self.normalize_punctuation(text)
        text = self.normalize_whitespace(text)
        return text

    @abstractmethod
    def normalize_special_characters(self, text: str) -> str:
        """Replace or remove special characters (return text if no change).

        In some languages, like German, a special character such as "ß"
        can be written as "ss" without changing the meaning. For languages
        like Italian, however, "è" and "e" are different, so no
        special-character normalization is applied there.
        """

    def normalize_case(self, text: str) -> str:
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def normalize_punctuation(self, text: str) -> str:
        """Replace every punctuation character in ``text`` with a space.

        A space (rather than the empty string) is substituted so that
        words separated only by punctuation do not get glued together;
        the surplus spaces are removed by ``normalize_whitespace``.
        """
        # ``\w`` also matches "_", so ``[^\w\s]`` alone would let
        # underscores through; the second alternative catches them.
        not_word_or_whitespace_or_underscore = r"[^\w\s]|_"
        space = " "
        return sub(
            pattern=not_word_or_whitespace_or_underscore,
            repl=space,
            string=text,
        )

    def normalize_whitespace(self, text: str) -> str:
        """Collapse each whitespace run to one space and strip the ends."""
        one_or_more_whitespace = r"\s+"
        space = " "
        return sub(pattern=one_or_more_whitespace, repl=space, string=text).strip()
class GenericNormalizer(TextNormalizer):
    """Normalizer that leaves special characters exactly as they are."""

    def normalize_special_characters(self, text: str) -> str:
        """Hand ``text`` back untouched; no character substitution is done."""
        return text
class DutchNormalizer(TextNormalizer):
    """Normalizer for Dutch text."""

    def normalize_special_characters(self, text: str) -> str:
        """Strip combining marks after decomposing ``text``.

        The text is decomposed using ``self.form`` (NFKD unless a subclass
        overrides it), then every character whose combining class is
        non-zero is dropped, leaving only the base characters.
        """
        decomposed = normalize(self.form, text)
        base_characters = [ch for ch in decomposed if combining(ch) == 0]
        return "".join(base_characters)
class GermanNormalizer(TextNormalizer):
    """Normalizer for German text."""

    # Translation table built once at class creation. Upper-case umlauts and
    # the capital sharp s are listed explicitly because
    # ``TextNormalizer.normalize`` runs this step *before* lowercasing, so
    # "Äpfel" would otherwise end up as "äpfel" instead of "aepfel".
    _transliterations = str.maketrans(
        {
            "ß": "ss",
            "ẞ": "SS",
            "ä": "ae",
            "ö": "oe",
            "ü": "ue",
            "Ä": "Ae",
            "Ö": "Oe",
            "Ü": "Ue",
        }
    )

    def normalize_special_characters(self, text: str) -> str:
        """Replace German special characters with ASCII equivalents.

        Umlauts become the base vowel followed by "e" and the sharp s
        becomes "ss", in both lower and upper case. All other characters
        are returned as they are after NFC composition.
        """
        # Compose first so a decomposed umlaut (base vowel + U+0308) is
        # turned into the single code point the table looks for.
        composed = normalize("NFC", text)
        return composed.translate(self._transliterations)
class TextNormalizerFactory:
    """Callable factory that builds the TextNormalizer for a language."""

    # Languages with a dedicated normalizer; anything else falls back to
    # GenericNormalizer.
    _language_code_to_normalizer = {
        SupportedLanguageCodes.DUTCH: DutchNormalizer,
        SupportedLanguageCodes.GERMAN: GermanNormalizer,
    }

    def __call__(self, language_code: SupportedLanguageCodes) -> TextNormalizer:
        """Return a new normalizer instance for ``language_code``."""
        registry = self._language_code_to_normalizer
        if language_code in registry:
            chosen_cls = registry[language_code]
        else:
            chosen_cls = GenericNormalizer
        return chosen_cls()
# Module-level factory instance; call it with a language code to obtain the
# matching TextNormalizer.
text_normalizer_factory = TextNormalizerFactory()