Do you use Linux? I use these shell functions:
# copyright by Werner Rudolph <werner (at) artistoex (dot) net>
# copying and distributing of the following source code
# is permitted, as long as this note is preserved.
# ftr CHAR1 CHAR2
# translate delimiter char in frequency list
#
ftr()
{
sed -r 's/^( *[0-9]+)'"$1"'/\1'"$2"'/'
}
# valid-collocations -- find valid collocations in inputstream
# reads records COUNT<SPC>COLLOCATION from inputstream
# writes records with existing collocations to stdout.
valid-collocations ()
{
#sort -k 2 -m - "$coll" |uniq -f 1 -D|accumulate
local delimiter="_"
ftr ' ' $delimiter |
join -t $delimiter -o 1.1 0 -1 2 -2 1 - /tmp/wordsets-helper-collocations |
ftr $delimiter ' '
}
# ngrams MAX [MIN]
#
# Generates all n-grams (for each MIN <= n <= MAX, where MIN defaults to 2)
# from inputstream
#
# reads word list, as generated by
#
# $ words < text
#
# from stdin. For each WORD in wordlist, it writes MAX-1 records
#
# COUNT<TAB>WORD<SPC>SUCC_1<SPC>
# COUNT<TAB>WORD<SPC>SUCC_1<SPC>SUCC_2<SPC>
# :
# COUNT<TAB>WORD<SPC>SUCC_1<SPC>SUCC_2<SPC>...<SPC>SUCC_MAX-2
# COUNT<TAB>WORD<SPC>SUCC_1<SPC>SUCC_2<SPC>...<SPC>SUCC_MAX-1
#
# to stdout, where word SUCC follows word WORD, and SUCC_n follows
# SUCC_n-1 in input stream COUNT times.
ngrams ()
{
local max=$1
local min=${2:-2};
awk 'FNR > 1 {print old " " $0} {old=$1}' | if (( $max - 1 > 1 )); then
if (( $min <= 2 )); then
tee >( ngrams $(( $max - 1 )) $(( $min - 1 )) );
else
ngrams $(( $max - 1 )) $(( $min - 1 ));
fi;
else
cat;
fi
}
words() {
grep -Eo '\<([a-zA-Z]'"'"'?){'${1:-3}',}\>'|grep -v "[A-Z]"
}
parse-collocations() {
local freq=${1:-0}
local length=${2:-4}
words | ngrams $length | sort | uniq -c |
awk '$1 > '"$freq"' { print $0; }' |
valid-collocations
}
Where parse-collocations
is the actual function to use. It accepts two optional parameters: the first sets a frequency threshold — terms occurring that many times or fewer are skipped from the result (defaults to 0, i.e. all terms are considered). The second parameter sets the maximum term length to search for (defaults to 4). The function reads the text from stdin and prints the terms to stdout line by line. It requires a dictionary file at /tmp/wordsets-helper-collocations
(download one here):
Usage example:
$ parse-collocations < some-text
would be pretty much what you want. However, if you don't want terms to be matched with a dictionary, you can use this one
$ words < some-text | ngrams 4 3 | sort | uniq -c |sort -nr
ngrams
's first parameter sets the maximum term length whereas its second (optional) parameter sets the minimum term length.