"""Source code for persephone.utterance: the `Utterance` data structure and helpers for preprocessing utterances."""

from collections import defaultdict
from pathlib import Path
from typing import List, NamedTuple, Set, Sequence, Tuple, DefaultDict, Dict

class Utterance(NamedTuple):
    """ An immutable object that represents a single utterance.

    `Utterance` instances capture key data about short segments of speech in
    the corpus.  Their most important role is in representing transcriptions
    in various states of preprocessing. For instance, `Utterance` instances may
    be created when reading from a linguist's transcription files, in which
    case their `text` attribute is a raw unpreprocessed transcription. These
    `Utterance` instances may then be fed to a function that preprocesses the
    text, returning new `Utterance` instances with, say, phonemes delimited
    with spaces so that they are in an appropriate format for model training.

    Note that `Utterance` instances are not required as arguments to `Corpus`
    constructors. They exist to aid in preprocessing.

    Attributes:
        org_media_path: A `pathlib.Path` to the original source audio that contains the
            utterance (which may comprise many utterances).
        org_transcription_path: A `pathlib.Path` to the source of the transcription of
            the utterance (which may comprise many utterances in the
            case of, say, ELAN files).
        prefix: A string identifier for the utterance which is used to prefix the
            target wav and transcription files, which are called `<prefix>.wav`,
            `<prefix>.phonemes`, etc.
        start_time: An integer denoting the offset, in milliseconds, of the
            utterance in the original media file found in `org_media_path`.
        end_time: An integer denoting the endpoint, in milliseconds, of the
            utterance in the original media file found in `org_media_path`.
        text: A string representation of the transcription.
        speaker: A string representation of the speaker of the utterance.

    """

    org_media_path: Path
    org_transcription_path: Path
    prefix: str
    start_time: int
    end_time: int
    text: str
    speaker: str

def write_transcriptions(utterances: List[Utterance],
                         tgt_dir: Path, ext: str, lazy: bool) -> None:
    """ Write the utterance transcriptions to files in the tgt_dir.

    Each utterance's text is written to its own file, named
    `<utterance.prefix>.<ext>`. The target directory is created if it does
    not already exist.

    Args:
        utterances: A list of Utterance objects to be written.
        tgt_dir: The directory in which to write the text of the utterances,
            one file per utterance.
        ext: The file extension for the utterances. Typically something like
            "phonemes", or "phonemes_and_tones".
        lazy: If True, utterances whose target file already exists are
            skipped; if False, existing files are overwritten.

    """

    tgt_dir.mkdir(parents=True, exist_ok=True)
    for utter in utterances:
        out_path = tgt_dir / "{}.{}".format(utter.prefix, ext)
        # Avoid redoing work that a previous run already produced.
        if lazy and out_path.is_file():
            continue
        with out_path.open("w") as f:
            # print() supplies the trailing newline.
            print(utter.text, file=f)

def remove_duplicates(utterances: List[Utterance]) -> List[Utterance]:
    """ Returns the utterances with duplicates removed, keeping the first
    occurrence of each. Utterances are considered duplicates when they share
    start_time, end_time and text; other metadata isn't considered.
    """

    seen = set() # type: Set[Tuple[int, int, str]]
    unique = [] # type: List[Utterance]
    for utter in utterances:
        key = (utter.start_time, utter.end_time, utter.text)
        if key not in seen:
            seen.add(key)
            unique.append(utter)

    return unique

def remove_empty_text(utterances: List[Utterance]) -> List[Utterance]:
    """Keep only the utterances whose text is non-empty after stripping
    whitespace.

    Args:
        utterances: The list of utterance we are processing
    """
    def has_text(utter: Utterance) -> bool:
        return utter.text.strip() != ""

    return list(filter(has_text, utterances))

# Doing everything in milliseconds now; other units are only for reporting to
# users
def duration(utter: Utterance) -> int:
    """Get the duration of an utterance in milliseconds
    Args:
        utter: The utterance we are finding the duration of
    """
    start, end = utter.start_time, utter.end_time
    return end - start

def total_duration(utterances: List[Utterance]) -> int:
    """Get the duration of an entire list of utterances in milliseconds

    Args:
        utterances: The list of utterance we are finding the duration of
    """
    # A generator expression avoids materialising an intermediate list of
    # durations just to sum it.
    return sum(duration(utter) for utter in utterances)

def make_speaker_utters(utterances: List[Utterance]) -> Dict[str, List[Utterance]]:
    """ Groups the utterances by speaker.

    Returns a mapping (a `defaultdict`, so looking up an unseen speaker
    yields an empty list) from each speaker to their utterances, in input
    order.
    """

    grouped = defaultdict(list) # type: DefaultDict[str, List[Utterance]]
    for utterance in utterances:
        grouped[utterance.speaker].append(utterance)

    return grouped

def speaker_durations(utterances: List[Utterance]) -> List[Tuple[str, int]]:
    """ Itemizes the utterances by speaker, returning a list of tuples of
    the form (Speaker Name, total duration in milliseconds).
    """

    return [(speaker, total_duration(utters))
            for speaker, utters in make_speaker_utters(utterances).items()]

def remove_too_short(utterances: List[Utterance],
                     _winlen=25, winstep=10) -> List[Utterance]:
    """ Removes utterances that will probably have issues with CTC because of
    the number of frames being less than the number of tokens in the
    transcription. Assuming char tokenization to minimize false negatives.
    """
    def has_enough_frames(utterance: Utterance) -> bool:
        # One feature frame is produced per `winstep` milliseconds of audio.
        frames = duration(utterance) / winstep
        return frames >= len(utterance.text)

    return [utter for utter in utterances if has_enough_frames(utter)]