# text.py: Statistical Language Processing tools (AIMA Chapter 23)


"""Statistical Language Processing tools.  Not all working yet. (Chapter 23)"""

from utils import *
import random, bisect
from collections import defaultdict

class DiscreteProbDist:
    """A discrete probability distribution estimated from observed counts.
    If p is a member of the class and o is an observed value, then there
    are 3 main operations:
        p.add(o) increments the count for observation o by 1.
        p.sample() returns a random element from the distribution.
        p[o] returns the probability for o.
    len(p) gives the total number of observations."""

    def __init__(self):
        # dictionary maps each observation to its count; missing keys count 0.
        self.dictionary = defaultdict(int)
        # table is a list of (cumulative_count, observation) pairs used for
        # sampling; it is rebuilt lazily whenever needs_recompute is set.
        self.needs_recompute = False
        self.table = []
        self.n_obs = 0

    def add(self, o):
        """Add an observation o to the distribution."""
        self.dictionary[o] += 1
        self.needs_recompute = True

    def sample(self):
        """Return a random sample from the distribution, or None if empty."""
        if self.needs_recompute: self._recompute()
        if self.n_obs == 0:
            return None
        # Pick k uniformly in 1..n_obs, then find the first table entry whose
        # cumulative count is >= k.  The 1-tuple (k,) sorts before (k, o), so
        # bisect_left lands on exactly that entry.
        k = 1 + random.randrange(self.n_obs)
        i = bisect.bisect_left(self.table, (k,))
        (count, o) = self.table[i]
        return o

    def __getitem__(self, item):
        """Return an unsmoothed estimate of the probability of item.
        An empty distribution assigns probability 0.0 to everything."""
        if self.needs_recompute: self._recompute()
        if self.n_obs == 0:
            return 0.0
        return self.dictionary[item] / float(self.n_obs)

    def __len__(self):
        """Return the total number of observations."""
        if self.needs_recompute: self._recompute()
        return self.n_obs

    def _recompute(self):
        """Recompute the total count n_obs and the cumulative-count table."""
        n_obs = 0
        table = []
        for (o, count) in self.dictionary.items():
            n_obs += count
            table.append((n_obs, o))
        self.n_obs = n_obs
        self.table = table
        self.needs_recompute = False

class UnigramTextModel(DiscreteProbDist):
    """A unigram (bag-of-words) language model.  It is a discrete
    probability distribution over words, so add, sample, and indexing all
    work as in DiscreteProbDist.  Two conveniences are added: add_text
    counts every whitespace-separated word of a string, and samples(n)
    generates a random n-word string according to the model."""

    def add_text(self, text):
        """Count each whitespace-separated word of the string text."""
        for token in text.split():
            self.add(token)

    def samples(self, nwords):
        """Return a string of nwords words sampled randomly from the model."""
        return ' '.join(self.sample() for _ in range(nwords))

class NgramTextModel(DiscreteProbDist):
    """This language model allows you to add n-grams, assess the
    probability of an n-gram (inherited from DiscreteProbDist; note the
    inherited methods deal with tuples, not strings), and generate random
    text with samples(nwords), which samples each word conditioned on the
    preceding (n-1)-gram."""

    def __init__(self, n):
        """Create an empty model of n-grams (n >= 1)."""
        DiscreteProbDist.__init__(self)
        self.n = n
        # cond_prob maps each (n-1)-gram context tuple to a
        # DiscreteProbDist over the words that follow it.
        self.cond_prob = {}

    ## sample, __getitem__, __len__ methods inherited from DiscreteProbDist

    def add(self, ngram):
        """Add one n-gram (a tuple of n words): count it both as a whole
        n-gram and as an observation of its last word given its context."""
        DiscreteProbDist.add(self, ngram)
        context, word = tuple(ngram[:-1]), ngram[-1]
        if context not in self.cond_prob:
            self.cond_prob[context] = DiscreteProbDist()
        self.cond_prob[context].add(word)

    def add_text(self, text):
        """Add all n-grams of the string text (broken on whitespace),
        padding the start with n-1 empty-string words so the first real
        words get counted in a start-of-text context."""
        n = self.n
        words = [''] * (n - 1) + text.split()
        for i in range(len(words) - n + 1):
            self.add(tuple(words[i:i + n]))

    def samples(self, nwords):
        """Generate a string of nwords words of random text, each word
        sampled conditioned on the previous n-1 words."""
        n = self.n
        context = ('',) * (n - 1)   # start-of-text context
        output = []
        for i in range(nwords):
            if context not in self.cond_prob:
                # Dead end: no observed continuation; restart at the
                # start-of-text context.
                context = ('',) * (n - 1)
            word = self.cond_prob[context].sample()
            output.append(word)
            context = context[1:] + (word,)
        return ' '.join(output)


def tops(file, n):
    """Return the n most frequent words in the named file, as a list of
    (count, word) pairs sorted most-frequent first."""
    # Was 'UnigramDist()', a name that is not defined anywhere in this module.
    d = UnigramTextModel()
    with open(file) as f:
        d.add_text(f.read())
    return sorted(((v, k) for (k, v) in d.dictionary.items()),
                  reverse=True)[:n]


def viterbi_segment(text="itiseasy", P=None):
    """Segment text into the most probable sequence of words, assuming
    each word is drawn independently from the unigram distribution P
    (any mapping from word string to probability).  Return the pair
    (list_of_words, probability_of_that_segmentation)."""
    n = len(text)
    # best[i] = probability of the best segmentation of text[0:i]
    # word_at[i] = last word of that best segmentation (ends at position i)
    best = [1.0] + [0.0] * n
    word_at = [''] + list(text)
    for end in range(1, n + 1):
        for start in range(end):
            w = text[start:end]
            candidate = P[w] * best[start]
            if candidate >= best[end]:
                best[end] = candidate
                word_at[end] = w
    # Walk backwards from the end, recovering the chosen words.
    sequence = []
    i = n
    while i > 0:
        sequence.insert(0, word_at[i])
        i -= len(word_at[i])
    return sequence, best[-1]



# Copyright: Peter Norvig, 2002.
# AIMA: Python Code, Example Output.
# Python.org: Tutorial, Language Ref, Libraries.