As an exercise, I wrote a simple implementation of Markov chains in MATLAB. Basically, it is a letter-based probabilistic model for generating words.
function mc = MC(numStates)
    %# first-order Markov chain over letters, with add-one smoothing
    N = numStates;
    PI = ones(1,N)/N;    %# initial-letter probabilities
    TR = ones(N,N)/N;    %# letter-to-letter transition probabilities

    mc = struct('logprob',@logprob, 'train',@train, ...
        'sample',@sample, 'sampleFiltered',@sampleFiltered);

    function train(seqs)
        %# count first letters and transitions over all sequences
        PI = 1 + histc(cellfun(@(c)c(1), seqs)', 1:N); %#'
        TR = ones(N,N);
        for i=1:numel(seqs)
            ind = sub2ind([N N], seqs{i}(1:end-1), seqs{i}(2:end));
            TR = TR + reshape(histc(ind,1:N*N), [N N]);
        end
        %# normalize counts into probabilities (guard against all-zero rows)
        PI = bsxfun(@rdivide, PI, sum(PI,2)+(sum(PI,2)==0));
        TR = bsxfun(@rdivide, TR, sum(TR,2)+(sum(TR,2)==0));
    end

    function seq = sample(len)
        %# generate a random sequence of the requested length
        seq = zeros(1,len);
        seq(1) = randsample(1:N, 1, true, PI);
        for t=2:len
            seq(t) = randsample(1:N, 1, true, TR(seq(t-1),:));
        end
    end

    function seq = sampleFiltered(allowed)
        %# generate a sequence that uses each allowed letter at most once
        len = numel(allowed);
        seq = zeros(1,len);
        seq(1) = randsample(allowed, 1, true, PI(allowed));
        allowed( find(allowed==seq(1),1,'first') ) = [];
        for t=2:len-1
            seq(t) = randsample(allowed, 1, true, TR(seq(t-1),allowed));
            allowed( find(allowed==seq(t),1,'first') ) = [];
        end
        seq(len) = allowed;    %# the single remaining letter fills the last slot
        seq = seq(seq~=0);
    end
    function LL = logprob(seq)
        %# log-likelihood of a sequence under the current model
        LL = log(PI(seq(1))) + ...
            sum( log(TR(sub2ind([N N],seq(1:end-1),seq(2:end)))) );
    end
end
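Before moving to real text, here is a tiny usage sketch on a hypothetical 3-letter alphabet (1='a', 2='b', 3='c'); the toy sequences are made up for illustration. Note that train expects a column cell array of index vectors, which is exactly what the preprocessing below produces:
%# toy sanity check (made-up data): 3-letter alphabet, a few short "words"
mc = MC(3);
mc.train({[1 2 3 1 2]; [2 3 1]; [1 2 2 3]});   %# column cell array of letter indices
w = mc.sample(4);                               %# random 4-letter sequence
fprintf('seq = %s , logP = %f\n', char(w + 'a' - 1), mc.logprob(w))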
We will need some text to train the model. We use 'The Wonderful Wizard of Oz' from Project Gutenberg.
%# read the text document
str = lower( urlread('http://www.gutenberg.org/files/55/55.txt') );
SP = char(32); %# delimiter (space)
str( ~isstrprop(str, 'alpha') ) = SP; %# replace non-letters with spaces
str( findstr(str, [SP SP]) ) = []; %# consecutive spaces as one
idx = ( str == SP ); %# location of spaces
df = diff([1 idx 1]);
len = find(df > 0) - find(df < 0); %# length of each word
[seqs gn] = grp2idx( str(~idx)' ); %#' map letters to numbers starting from 1
seqs = mat2cell(seqs', 1, len)'; %# put each word in a separate cell
N = length(gn); %# A to Z
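As an optional sanity check (the firstWords variable name is just for illustration), we can map a few of the parsed sequences back to letters and report the alphabet size:
%# (optional) inspect the parsed data
firstWords = cellfun(@(s) [gn{s}], seqs(1:5), 'UniformOutput',false)  %# first few words as strings
fprintf('%d words, alphabet of %d distinct letters\n', numel(seqs), N)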
Finally, we use the model to either sample random words or sample words from a set of letters:
%# train Markov chain
mc = MC(N);
mc.train(seqs);
%# sample a random word
seq = mc.sample( randi([3 10]) );
fprintf('word = %s , logP(word)=%f\n', [gn{seq}], mc.logprob(seq))
%# sample a word from a set of letters
letters = lower('markovchains');
lettersIdx = cellfun(@(c) find(strcmp(c,gn)), cellstr(letters')); %#'
seq = mc.sampleFiltered(lettersIdx);
fprintf('word = %s , logP(word)=%f\n', [gn{seq}], mc.logprob(seq))
Here are a few examples generated from the letters 'markovchains', along with the log-probability of each word under the model:
word = mivorancask , logP(word)=-29.610819
word = arknoamshiv , logP(word)=-32.496090
word = ancoramshik , logP(word)=-29.299897
word = orchisankav , logP(word)=-29.987204
word = avinchasorm , logP(word)=-27.178507
word = aronchaskim , logP(word)=-25.651964
You can see that although none of them are real words, they are still better than a purely random sequence of letters. Obviously, using only the previous character to generate the next one is not enough, but the approach is easily extended to higher-order (N-gram) models.
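For instance, a second-order variant could look roughly like the sketch below, where the state is the previous pair of letters rather than a single letter. This is only a rough sketch reusing the add-one smoothing idea from above; the MC2 name, the pair encoding via sub2ind, and the next helper are my own choices for illustration, not part of the code above:
function mc2 = MC2(numStates)
    %# second-order letter model: P(x_t | x_{t-2}, x_{t-1})
    N = numStates;
    TR2 = ones(N*N, N);                 %# rows indexed by the encoded pair (prev2,prev1)
    mc2 = struct('train',@train, 'next',@next);

    function train(seqs)
        TR2 = ones(N*N, N);             %# add-one smoothing, as in MC above
        for i=1:numel(seqs)
            s = seqs{i};
            for t=3:numel(s)
                r = sub2ind([N N], s(t-2), s(t-1));   %# encode the pair as a row index
                TR2(r, s(t)) = TR2(r, s(t)) + 1;
            end
        end
        TR2 = bsxfun(@rdivide, TR2, sum(TR2,2));      %# normalize each row
    end

    function x = next(prev2, prev1)
        %# sample the next letter given the previous two letters
        r = sub2ind([N N], prev2, prev1);
        x = randsample(1:N, 1, true, TR2(r,:));
    end
end
Sampling whole words with this variant would additionally need an initial distribution over the first letter pair, analogous to PI above.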
The nice thing about such an approach is that it is not restricted to one language; it can be adapted to any other language simply by feeding it documents in your language of choice.