The following Python code will generate a random sentence with the given number of terminals.
It works by counting the number of ways to produce a sentence of a given length, generating a large random number, and computing the indicated sentence.
The count is done recursively, with memoization.
An empty right hand side produces 1 sentence if n is 0 and 0 sentences otherwise.
To count the number of sentences produced by a nonempty right hand side, sum over i, the number of terminals used by the first symbol in the right hand side.
For each i, multiply the number of possibilities for the rest of the right hand side by the number of possibilities for the first symbol.
If the first symbol is a terminal, there is 1 possibility if i is 1 and 0 otherwise.
If the first symbol is a nonterminal, sum the possibilities over each alternative.
To avoid infinite loops, we have to be careful to prune the recursive calls when a quantity is 0.
This may still loop infinitely if the grammar has infinitely many derivations of one sentence.
For example, in the grammar
S -> S S
S ->
there are infinitely many derivations of the empty sentence: S => , S => S S => , S => S S => S S S => , etc.
The code to find a particular sentence is a straightforward modification of the code to count them.
This code is reasonably efficient, generating 100 sentences with 100 terminals each in less than a second.
import collections
import random
class Grammar:
def __init__(self):
self.prods = collections.defaultdict(list)
self.numsent = {}
self.weight = {}
def prod(self, lhs, *rhs):
self.prods[lhs].append(rhs)
self.numsent.clear()
def countsent(self, rhs, n):
if n < 0:
return 0
elif not rhs:
return 1 if n == 0 else 0
args = (rhs, n)
if args not in self.numsent:
sym = rhs[0]
rest = rhs[1:]
total = 0
if sym in self.prods:
for i in xrange(1, n + 1):
numrest = self.countsent(rest, n - i)
if numrest > 0:
for rhs1 in self.prods[sym]:
total += self.countsent(rhs1, i) * numrest
else:
total += self.countsent(rest, n - self.weight.get(sym, 1))
self.numsent[args] = total
return self.numsent[args]
def getsent(self, rhs, n, j):
assert 0 <= j < self.countsent(rhs, n)
if not rhs:
return ()
sym = rhs[0]
rest = rhs[1:]
if sym in self.prods:
for i in xrange(1, n + 1):
numrest = self.countsent(rest, n - i)
if numrest > 0:
for rhs1 in self.prods[sym]:
dj = self.countsent(rhs1, i) * numrest
if dj > j:
j1, j2 = divmod(j, numrest)
return self.getsent(rhs1, i, j1) + self.getsent(rest, n - i, j2)
j -= dj
assert False
else:
return (sym,) + self.getsent(rest, n - self.weight.get(sym, 1), j)
def randsent(self, sym, n):
return self.getsent((sym,), n, random.randrange(self.countsent((sym,), n)))
if __name__ == '__main__':
g = Grammar()
g.prod('S', 'NP', 'VP')
g.prod('S', 'S', 'and', 'S')
g.prod('S', 'S', 'after', 'which', 'S')
g.prod('NP', 'the', 'N')
g.prod('NP', 'the', 'A', 'N')
g.prod('NP', 'the', 'A', 'A', 'N')
g.prod('VP', 'V', 'NP')
g.prod('N', 'dog')
g.prod('N', 'fish')
g.prod('N', 'bird')
g.prod('N', 'wizard')
g.prod('V', 'kicks')
g.prod('V', 'meets')
g.prod('V', 'marries')
g.prod('A', 'red')
g.prod('A', 'striped')
g.prod('A', 'spotted')
g.weight.update({'and': 0, 'after': 0, 'which': 0, 'the': 0})
for i in xrange(100):
print ' '.join(g.randsent('S', 3))