"""
Offers functions for tokenizing utterances into phonemes, characters or
other symbols.
"""
from typing import Callable, Iterable, NamedTuple, Set, Dict
from ..utterance import Utterance
# LabelSegmenter bundles a segmentation function together with the label set
# it segments into. It uses the functional NamedTuple form, which takes no
# docstring at definition time, so __doc__ is attached separately below.
# NOTE(review): the docstring's claim that "namedtuples can't have special
# methods" refers to this functional form; a class-based NamedTuple subclass
# could define __call__ — confirm before refactoring.
LabelSegmenter = NamedTuple("LabelSegmenter",
[("segment_labels", Callable[[Utterance], Utterance]),
("labels", Set[str])])
LabelSegmenter.__doc__ = (
""" An immutable object that segments the phonemes of an utterance. This
could probably actually have a __call__ implementation. That won't work
because namedtuples can't have special methods. Perhaps it could instead
just be a function which we give a labels attribute. Perhaps that
obfuscates things a bit, but it could be okay.
Attributes:
segment_labels: A function that takes an Utterance and returns another
Utterance where the text field has changed to be phonemically
segmented, using spaces as delimiters. Eg "this is" -> "th i s i s".
labels: A set of labels (eg. phonemes or tones) relevant for segmenting.
""")
# Every Unicode character treated as whitespace when segmenting utterances.
UNICODE_WHITESPACE_CHARACTERS = [
"\u0009", # character tabulation
"\u000a", # line feed
"\u000b", # line tabulation
"\u000c", # form feed
"\u000d", # carriage return
"\u0020", # space
"\u0085", # next line
"\u00a0", # no-break space
"\u1680", # ogham space mark
"\u2000", # en quad
"\u2001", # em quad
"\u2002", # en space
"\u2003", # em space
"\u2004", # three-per-em space
"\u2005", # four-per-em space
"\u2006", # six-per-em space
"\u2007", # figure space
"\u2008", # punctuation space
"\u2009", # thin space
"\u200A", # hair space
"\u2028", # line separator
"\u2029", # paragraph separator
"\u202f", # narrow no-break space
"\u205f", # medium mathematical space
"\u3000", # ideographic space
]
# Translation table mapping each whitespace character to None (i.e. delete),
# built once so segment_into_chars does a single C-level pass per call.
_WHITESPACE_DELETION_TABLE = str.maketrans(
    {char: None for char in UNICODE_WHITESPACE_CHARACTERS})
def segment_into_chars(utterance: str) -> str:
    """ Segments an utterance into space delimited characters.

    All whitespace (as listed in UNICODE_WHITESPACE_CHARACTERS) is removed
    and the remaining characters are joined with single spaces.
    Eg "this is" -> "t h i s i s".

    Args:
        utterance: The string to segment.

    Returns:
        The space-delimited characters of the utterance.

    Raises:
        TypeError: If utterance is not a string.
    """
    if not isinstance(utterance, str):
        raise TypeError("Input type must be a string. Got {}.".format(type(utterance)))
    # Remove every whitespace character in one pass. (The previous version
    # called utterance.strip() and discarded the result — a no-op — then
    # chained one .replace() call per whitespace character.)
    utterance = utterance.translate(_WHITESPACE_DELETION_TABLE)
    return " ".join(utterance)
def segment_into_tokens(utterance: str, token_inventory: Iterable[str]) -> str:
    """
    Segments an utterance (a string) into tokens based on an inventory of
    tokens (a list or set of strings).

    The approach: Given the rest of the utterance, find the largest token (in
    character length) that is found in the token_inventory, and treat that as
    a token before segmenting the rest of the string. Characters that cannot
    start any known token are silently dropped.

    Note: Your orthography may open the door to ambiguities in the
    segmentation. Hopefully not, but another alternative is to simply segment
    on characters with segment_into_chars().

    Args:
        utterance: The string to segment.
        token_inventory: The tokens to segment against. An empty inventory
            yields an empty result.

    Returns:
        The utterance's tokens joined by single spaces.

    Raises:
        TypeError: If utterance is not a string.
    """
    if not isinstance(utterance, str):
        raise TypeError("Input type must be a string. Got {}.".format(type(utterance)))
    # Token inventory needs to be hashable for speed
    token_inventory = set(token_inventory)
    # The longest token bounds how far ahead the greedy match must look.
    # (max() is O(n); the previous sorted(...)[-1] was O(n log n) and raised
    # an unhelpful IndexError on an empty inventory.)
    max_len = max((len(token) for token in token_inventory), default=0)

    tokens = []
    rest = utterance
    while rest:
        # Greedy "maximal munch": take the longest prefix of the remaining
        # string that is a known token. Never probe past the end of rest.
        for i in range(min(max_len, len(rest)), 0, -1):
            if rest[:i] in token_inventory:
                tokens.append(rest[:i])
                rest = rest[i:]
                break
        else:
            # No token starts here; skip this character and move on.
            # TODO This needs to be logged with a warning on the first
            # occurrence.
            rest = rest[1:]
    return " ".join(tokens)
def make_indices_to_labels(labels: Set[str]) -> Dict[int, str]:
    """ Creates a mapping from contiguous integer indices to labels.

    Index 0 is reserved for the "pad" label; the supplied labels follow in
    sorted order starting at index 1.
    """
    ordered_labels = ["pad"] + sorted(labels)
    return dict(enumerate(ordered_labels))