"""Statistical Language Processing tools. Not all working yet. (Chapter 23)"""
from utils import *
import random, bisect

class DiscreteProbDist:
    """A discrete probability distribution. If p is an instance of the class
    and o is an observed value, then there are 3 main operations:
    p.add(o) increments the count for observation o by 1.
    p.sample() returns a random element from the distribution.
    p[o] returns the probability for o."""

    def __init__(self):
        update(self, dictionary=DefaultDict(0), needs_recompute=False,
               table=[], n_obs=0)

    def add(self, o):
        """Add an observation o to the distribution."""
        self.dictionary[o] += 1
        self.needs_recompute = True

    def sample(self):
        """Return a random sample from the distribution, weighted by count."""
        if self.needs_recompute: self._recompute()
        if self.n_obs == 0:
            return None
        ## table holds (cumulative_count, o) pairs; draw a point in 1..n_obs
        ## and find the first entry whose cumulative count reaches it.
        i = bisect.bisect_left(self.table, (1 + random.randrange(self.n_obs),))
        (count, o) = self.table[i]
        return o

    def __getitem__(self, item):
        """Return an unsmoothed estimate of the probability of item."""
        if self.needs_recompute: self._recompute()
        if self.n_obs == 0:
            return 0.0
        return self.dictionary[item] / float(self.n_obs)

    def __len__(self):
        if self.needs_recompute: self._recompute()
        return self.n_obs

    def _recompute(self):
        """Recompute the total count n_obs and the table of cumulative counts."""
        n_obs = 0
        table = []
        for (o, count) in self.dictionary.items():
            n_obs += count
            table.append((n_obs, o))
        update(self, n_obs=n_obs, table=table, needs_recompute=False)
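
## A minimal usage sketch (illustrative; the observations and the helper
## name _demo_discrete_prob_dist are made up for this example):
def _demo_discrete_prob_dist():
    p = DiscreteProbDist()
    for o in ['a', 'a', 'a', 'b']:
        p.add(o)                        # counts: a -> 3, b -> 1
    assert len(p) == 4                  # total number of observations
    assert p['a'] == 0.75               # unsmoothed estimate 3/4
    assert p.sample() in ('a', 'b')     # count-weighted random draw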

class UnigramTextModel(DiscreteProbDist):
    """This is a kind of discrete probability distribution, so you can
    add, sample, or test for the probability of a word, just as with
    DiscreteProbDist. You can also add an entire text (broken on whitespace)
    with p.add_text('...'), and generate a random text with p.samples(n)."""

    def add_text(self, text):
        """Add the words in the string text to the unigram distribution."""
        for word in text.split():
            self.add(word)

    def samples(self, nwords):
        """Return a string of nwords words, drawn at random from the model."""
        return ' '.join([self.sample() for i in range(nwords)])
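
## A sketch of typical use (the training string is made up): build a
## unigram model from a text, query a word probability, generate text.
def _demo_unigram_model():
    p = UnigramTextModel()
    p.add_text('the quick brown fox jumps over the lazy dog')
    assert p['the'] == 2 / 9.0          # 'the' is 2 of the 9 words
    print(p.samples(5))                 # e.g. 'the fox the dog quick'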

class NgramTextModel(DiscreteProbDist):
    """This is a discrete probability distribution over n-tuples of words.
    You can add an n-gram, assess the probability of an n-gram, and sample
    a random next word given the preceding (n-1)-gram."""

    def __init__(self, n):
        DiscreteProbDist.__init__(self)
        self.n = n
        ## cond_prob maps an (n-1)-tuple of words to a DiscreteProbDist
        ## over the words observed to follow it.
        self.cond_prob = {}

    ## sample, __len__, and __getitem__ are inherited from DiscreteProbDist.
    ## Note they deal with tuples of words, not strings, as inputs.

    def add(self, ngram):
        """Count the n-gram (a tuple of n words), and also count ngram[-1]
        as a continuation of the (n-1)-gram ngram[:-1]."""
        DiscreteProbDist.add(self, ngram)
        prefix = ngram[:-1]
        if prefix not in self.cond_prob:
            self.cond_prob[prefix] = DiscreteProbDist()
        self.cond_prob[prefix].add(ngram[-1])

    def add_text(self, text):
        """Add all n-grams from the words in the string text, sliding a
        window of width n; pad the start with n-1 empty words."""
        n = self.n
        words = [''] * (n-1) + text.split()
        for i in range(len(words) - n + 1):
            self.add(tuple(words[i:i+n]))

    def samples(self, nwords):
        """Generate nwords words of random text, sampling each word
        conditional on the n-1 words that precede it."""
        n = self.n
        nminus1gram = ('',) * (n-1)
        output = []
        for i in range(nwords):
            if nminus1gram not in self.cond_prob:
                nminus1gram = ('',) * (n-1)  # Dead end; restart at the start state.
                if nminus1gram not in self.cond_prob:
                    break                    # The model has no data at all.
            word = self.cond_prob[nminus1gram].sample()
            output.append(word)
            nminus1gram = nminus1gram[1:] + (word,)
        return ' '.join(output)
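
## A sketch of the bigram case (training string made up): the model counts
## word pairs and samples each word given the one before it.
def _demo_ngram_model():
    P2 = NgramTextModel(2)
    P2.add_text('to be or not to be')
    ## 6 bigrams are added, including one starting with the '' padding.
    assert P2[('to', 'be')] == 2 / 6.0
    print(P2.samples(6))                # deterministic here: 'to be or not to be'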

def tops(filename, n):
    """Return a list of the n most frequent (count, word) pairs
    in the named file."""
    d = UnigramTextModel()
    d.add_text(open(filename).read())
    items = [(v, k) for (k, v) in d.dictionary.items()]
    items.sort(reverse=True)
    return items[0:n]
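
## A sketch of calling tops, written against a throwaway temp file so the
## example is self-contained (the file contents are made up):
def _demo_tops():
    import tempfile
    f = tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False)
    f.write('the cat sat on the mat the end')
    f.close()
    print(tops(f.name, 2))              # -> [(3, 'the'), (1, 'sat')]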

def viterbi_segment(text, P):
    """Find the best segmentation of the string of characters text,
    given the UnigramTextModel P. Return (words, probability)."""
    # best[i] = best probability for text[0:i]
    # words[i] = best word ending at position i
    n = len(text)
    words = [''] + list(text)
    best = [1.0] + [0.0] * n
    ## Fill in best and words by dynamic programming
    for i in range(n+1):
        for j in range(0, i):
            w = text[j:i]
            if P[w] * best[i - len(w)] >= best[i]:
                best[i] = P[w] * best[i - len(w)]
                words[i] = w
    ## Now recover the sequence of best words
    sequence = []; i = len(words) - 1
    while i > 0:
        sequence[0:0] = [words[i]]
        i -= len(words[i])
    ## Return the sequence of best words and the overall probability
    return sequence, best[-1]
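
## A sketch of word segmentation (training text made up): train a unigram
## model, then recover the word boundaries in unspaced text.
def _demo_segment():
    P = UnigramTextModel()
    P.add_text('it is easy it is easy it is easy to read')
    words, p = viterbi_segment('itiseasy', P)
    assert words == ['it', 'is', 'easy']
    print(words, p)                     # p = P['it'] * P['is'] * P['easy']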

# Copyright: Peter Norvig, 2002.