Source code for persephone.distance

from typing import Any, Callable, List, Sequence, TypeVar, Tuple

import numpy as np

from persephone.exceptions import EmptyReferenceException

T = TypeVar("T")

[docs]def min_edit_distance(
        source: Sequence[T], target: Sequence[T],
        ins_cost: Callable[..., int] = lambda _x: 1,
        del_cost: Callable[..., int] = lambda _x: 1,
        sub_cost: Callable[..., int] = lambda x, y: 0 if x == y else 1) -> int:
    """Calculates the minimum edit distance between two sequences.

    Uses the Levenshtein weighting as a default, but offers keyword arguments
    to supply functions to measure the costs for editing with different
    elements.

    Args:
        ins_cost: A function describing the cost of inserting a given char
        del_cost: A function describing the cost of deleting a given char
        sub_cost: A function describing the cost of substituting one char for

    Returns:
        The edit distance between the two input sequences.

    """

    # Initialize an m+1 by n+1 array. Note that the strings start from index 1,
    # with index 0 being used to denote the empty string.
    n = len(target)
    m = len(source)
    distance = np.zeros((m+1, n+1), dtype=np.int16)

    # Initialize the zeroth row and column to be the distance from the empty
    # string.
    for i in range(1, m+1):
        distance[i, 0] = distance[i-1, 0] + ins_cost(source[i-1])
    for j in range(1, n+1):
        distance[0, j] = distance[0, j-1] + ins_cost(target[j-1])

    # Do the dynamic programming to fill in the matrix with the edit distances.
    for j in range(1, n+1):
        for i in range(1, m+1):
            distance[i, j] = min(
                    distance[i-1, j] + ins_cost(source[i-1]),
                    distance[i-1, j-1] + sub_cost(source[i-1],target[j-1]),
                    distance[i, j-1] + del_cost(target[j-1]))

    return int(distance[len(source), len(target)])

[docs]def min_edit_distance_align(
        # TODO Wrangle the typing errors in this function.
        # TODO This could work on generic sequences but for now it relies on
        # empty strings.
        #source: Sequence[str], target: Sequence[str],
        #ins_cost: Callable[..., int] = lambda _x: 1,
        #del_cost: Callable[..., int] = lambda _x: 1,
        #sub_cost: Callable[..., int] = lambda x, y: 0 if x == y else 1
        #) -> List[Tuple[str, str]]:
        source, target,
        ins_cost = lambda _x: 1,
        del_cost = lambda _x: 1,
        sub_cost = lambda x, y: 0 if x == y else 1):
    """Finds a minimum cost alignment between two strings.

    Uses the Levenshtein weighting as a default, but offers keyword arguments
    to supply functions to measure the costs for editing with different
    characters. Note that the alignment may not be unique.

    Args:
        ins_cost: A function describing the cost of inserting a given char
        del_cost: A function describing the cost of deleting a given char
        sub_cost: A function describing the cost of substituting one char for

    Returns:
        A sequence of tuples representing character level alignments between
        the source and target strings.
    """

    # Initialize an m+1 by n+1 array to hold the distances, and an equal sized
    # array to store the backpointers. Note that the strings start from index
    # 1, with index 0 being used to denote the empty string.
    n = len(target)
    m = len(source)
    dist = [[0]*(n+1) for _ in range(m+1)]
    bptrs = [[[]]*(n+1) for _ in range(m+1)] # type: List[List[List]]

    # Adjust the initialization of the first column and row of the matrices to
    # their appropriate values.
    for i in range(1, m+1):
        dist[i][0] = i
        bptrs[i][0] = (i-1, 0)
    for j in range(1, n+1):
        dist[0][j] = j
        bptrs[0][j] = (0, j-1)

    # Do the dynamic programming to fill in the matrix with the edit distances.
    for j in range(1, n+1):
        for i in range(1, m+1):
            options = [
                    (dist[i-1][j] + ins_cost(target[j-1]),
                        (i-1, j)),
                    (dist[i-1][j-1] + sub_cost(source[i-1],target[j-1]),
                        (i-1, j-1)),
                    (dist[i][j-1] + del_cost(source[i-1]),
                        (i, j-1))]
            (minimum, pointer) = sorted(options)[0]
            dist[i][j] = minimum
            bptrs[i][j] = pointer

    # Put the backtrace in a list, and reverse it to get a forward trace.
    bt = [(m,n)]
    cell = bptrs[m][n]
    while True:
        if not cell:
            # If it's an empty list or tuple, as will be the case at the start
            # of the trace
            break
        bt.append(cell)
        if bptrs[cell[0]][cell[1]]:
            cell = bptrs[cell[0]][cell[1]]
        else:
            break
    trace = list(reversed(bt))

    # Construct an alignment between source and target using the trace.
    alignment = []
    for i in range(1, len(trace)):
        current = trace[i]
        prev = trace[i-1]

        # If the cell is diagonal from the previous one, it's a substitution or
        # there wasn't a change.
        if current[0] - prev[0] == 1 and current[1] - prev[1] == 1:
            alignment.append((source[current[0]-1], target[current[1]-1]))
        # Otherwise if it moves only on the source side, it's a deletion
        elif current[0] - prev[0] == 1:
            alignment.append((source[current[0]-1], ""))
        # Otherwise if it moves only on the target side, it's an insertion
        elif current[1] - prev[1] == 1:
            alignment.append(("", target[current[1]-1]))

    return alignment

# TODO Wrangle the typing errors in this function.
#def cluster_alignment_errors(alignment: List[Tuple[str, str]]
#                            ) -> List[Tuple[Tuple[str, ...]]]:
def cluster_alignment_errors(alignment):
    # TODO Review documentation and consider for inclusion in API.
    """Clusters alignments
    Takes an alignment created by min_edit_distance_align() and groups
    consecutive errors together. This is useful, because there are often
    many possible alignments, and so often we can't meaningfully distinguish
    between alignment errors at the character level, so it makes many-to-many
    mistakes more readable."""

    newalign = []
    mistakes = ([],[])
    for align_item in alignment:
        if align_item[0] == align_item[1]:
            if mistakes != ([],[]):
                newalign.append((tuple(mistakes[0]), tuple(mistakes[1])))
                mistakes = ([],[])
            newalign.append((tuple([align_item[0]]), tuple([align_item[1]])))
        else:
            if align_item[0] != "":
                mistakes[0].append(align_item[0])
            if align_item[1] != "":
                mistakes[1].append(align_item[1])
    if mistakes != ([],[]):
        newalign.append((tuple(mistakes[0]), tuple(mistakes[1])))
        mistakes = ([],[])

    return newalign

[docs]def word_error_rate(ref: Sequence[T], hyp: Sequence[T]) -> float:
    """ Calculate the word error rate of a sequence against a reference.

    Args:
        ref: The gold-standard reference sequence
        hyp: The hypothesis to be evaluated against the reference.

    Returns:
        The word error rate of the supplied hypothesis with respect to the
        reference string.

    Raises:
        persephone.exceptions.EmptyReferenceException: If the length of the reference sequence is 0.

    """

    if len(ref) == 0:
        raise EmptyReferenceException(
            "Cannot calculating word error rate against a length 0 "\
            "reference sequence.")

    distance = min_edit_distance(ref, hyp)
    return 100 * float(distance) / len(ref)