import pprint
from collections import defaultdict
# This is a best approximation to what Bryan is trying to do.
# However the results are meaningless because the list is being
# mutated during iteration over it. So I haven't shown the output.
def extract_by_letters_0(letters, input_list):
dictionary = input_list.copy()
for word in dictionary:
for letter in letters:
if word.count(letter) != letters.count(letter):
if word in dictionary: #I cant leave this line out
dictionary.remove(word)
return dictionary
# This avoids the mutation.
# The results are anagrams PLUS letters that don't occur
# in the query. E.g. "same" produces "samehood" but not "sameness"
# ("sameness" has 3*"s" and 2*"e" instead of 1 of each)
def extract_by_letters_1(letters, input_list):
dictionary = set(input_list)
ripouts = set()
for word in dictionary:
for letter in letters:
if word.count(letter) != letters.count(letter):
ripouts.add(word)
return dictionary - ripouts
def anagram_key(strg):
return ''.join(sorted(list(strg)))
def check_anagrams(str1, str2):
return sorted(list(str1)) == sorted(list(str2))
# Advice: try algorithms like this out on a SMALL data set first.
# Get it working correctly. Use different test cases. Have test code
# however primitive that check your results.
# Then if it runs slowly, helpers
# don't have to guess what you are doing.
raw_text = """
twas brillig and the slithy toves
did gyre and gimble in the wabe
same mesa seam sameness samehood
"""
lexicon = sorted(set(raw_text.split()))
print "\nlexicon:", lexicon
#
# Assuming we want anagrams:
#
# Build an anagram dictionary
#
anagram_dict = defaultdict(set)
for word in lexicon:
anagram_dict[anagram_key(word)].add(word)
print "\nanagram_dict (len == %d):" % len(anagram_dict)
pprint.pprint(anagram_dict)
# now purge trivial entries
temp = {}
for k, v in anagram_dict.iteritems():
if len(v) != 1:
temp[k] = v
anagram_dict = temp
print "\nanagram_dict (len == %d):" % len(anagram_dict)
pprint.pprint(anagram_dict)
# Test cases
tests = "sam same mesa sameness samehood xsame samex".split()
default_set = frozenset()
for test in tests:
print
results = extract_by_letters_1(test, lexicon)
print test, [(result, check_anagrams(test, result)) for result in results]
# In the following statement, you can use set([test]) as the default
# if that produces a more useful or orthogonal result.
results = anagram_dict.get(anagram_key(test), default_set)
print test, [(result, check_anagrams(test, result)) for result in results]
Output:
lexicon: ['and', 'brillig', 'did', 'gimble', 'gyre', 'in', 'mesa', 'same', 'samehood', 'sameness', 'seam', 'slithy', 'the', 'toves', 'twas', 'wabe']
anagram_dict (len == 14):
defaultdict(<type 'set'>, {'abew': set(['wabe']), 'eht': set(['the']), 'egry': set(['gyre']), 'begilm': set(['gimble']), 'hilsty': set(['slithy']), 'aems': set(['mesa', 'seam', 'same']), 'bgiillr': set(['brillig']), 'ddi': set(['did']), 'eostv': set(['toves']), 'adehmoos': set(['samehood']), 'in': set(['in']), 'adn': set(['and']), 'aeemnsss': set(['sameness']), 'astw': set(['twas'])})
anagram_dict (len == 1):
{'aems': set(['mesa', 'same', 'seam'])}
sam [('mesa', False), ('samehood', False), ('seam', False), ('same', False)]
sam []
same [('mesa', True), ('samehood', False), ('seam', True), ('same', True)]
same [('mesa', True), ('seam', True), ('same', True)]
mesa [('mesa', True), ('samehood', False), ('seam', True), ('same', True)]
mesa [('mesa', True), ('seam', True), ('same', True)]
sameness [('sameness', True)]
sameness []
samehood [('samehood', True)]
samehood []
xsame []
xsame []
samex []
samex []