3 # Copyright (C) 2003 by Martin Pool <mbp@sourcefrog.net>
5 # Given a dictionary of words, find the longest word that can be grown
6 # from a three-letter seed by successively adding one letter, then
# rearranging.  All the intermediate stages must occur in the dictionary.
10 # The solution is determined by the dictionary and the seed.
12 # There may be more than one equally long solution.
14 # Since we rearrange the word after each addition we just need to
15 # remember the population count of letters, not the particular word.
# Perhaps we could use a Python 26-tuple giving the count of each letter.
20 # It seems like a greedy algorithm might work here...
22 # Algorithm: We hold a list of known 'reachable' words. Initialize
23 # the list of good words with the seed word. Read the dictionary and
24 # squash case and discard non-letter symbols. Iterate through the
25 # dictionary, considering the words in order of size.
27 # For each word in the dictionary, it is reachable if it can be made
28 # by adding a letter to any shorter reachable word, and then
29 # rearranging. Therefore it does not matter what order we process the
30 # words, as long as we do them in strictly ascending order by length.
32 # How do we efficiently find out if it is reachable? A
33 # straightforward way is to try removing each unique letter from the
34 # word, and see if that population is reachable. In other words, we
35 # turn the word into its population, and then generate the set of
36 # populations that are decreased by one in every non-zero count.
38 # So "aabd" has the population (2, 1, 0, 1). This is reachable if we
39 # have (1, 1, 0, 1), or (2, 0, 0, 1) or (2, 1, 0, 0).
41 # So for each word, we need to check a number of possible predecessors
42 # limited to the number of unique letters in the word; at most 26. If
43 # we store the reachable words in a hashtable indexed by population,
44 # we can do the probes in O(1) time.
46 # Therefore we can do the whole thing in time proportional to the
47 # number of words, O(n). I think this is optimal, since we have to at
48 # the very least read in each word and therefore cannot be quicker
51 # In the first version I read all the words into a single list and
52 # then sorted it by length. Doing the sort by length is quite slow in
53 # Python and in addition this is at best O(n log n). Doing a complete
54 # sort is a lot of work when we require just a very rough sort by
55 # size. So instead we read the words into buckets by length, then
58 # The bucket thing may be a bit more than O(n), because of the
59 # possible cost of copying things as we join the buckets. But it is
60 # not very expensive compared to the actual search. There are a
# fairly small number of buckets anyhow -- just proportional to the
# length of the longest word.
66 # Seed is given as the command line argument; words are read from
67 # stdin. This solution doesn't require the seed to be three letters.
69 # A good way to feed it is from
71 # grep '^[a-z]*$' /usr/share/dict/words
77 from pprint import pprint
78 from xreadlines import xreadlines
# Accept only words made up entirely of lowercase ASCII letters.
# re.match() anchors at the start of the string, but the pattern must
# also be end-anchored: the original '[a-z]*' matched a (possibly
# empty) prefix of ANY string, so the filter never rejected anything.
# '$' still matches just before a trailing newline, so raw input lines
# are accepted.
letters_re = re.compile(r'[a-z]+$')
def read_words():
    """Read words from stdin and return them in one list, ordered by
    increasing length.

    Words are lowercased; lines containing anything other than ASCII
    letters are discarded.

    NOTE(review): reconstructed from an incomplete excerpt (the def
    line and several interior lines were missing) -- confirm against
    the full file.
    """
    # indexed by length; contents is a list of words of that length
    by_len = {}

    # File objects are directly iterable (since Python 2.2); this
    # replaces the deprecated xreadlines module the original used.
    for w in sys.stdin:
        # squash case and drop the trailing newline
        w = w.strip().lower()

        # check chars are reasonable; the pattern must be end-anchored,
        # otherwise match() would accept any string with a letter prefix
        if not re.match(r'[a-z]+$', w):
            continue

        # Put it into the right bucket for its length.  Make a new
        # bucket if necessary.
        by_len.setdefault(len(w), []).append(w)

    # Now join up all the buckets so that we have one big list, sorted by
    # length.  This is a rough O(n) sort: exact order within a bucket
    # does not matter for the algorithm.
    all_words = []
    for l in sorted(by_len.keys()):
        all_words.extend(by_len[l])
    return all_words
def make_pop(w):
    """Return the letter population of w: a 26-tuple giving the count
    of each lowercase letter.

    Anagrams map to the same population, so the tuple serves as a
    hashable key identifying a word up to rearrangement.

    NOTE(review): the def line, accumulator initialisation and return
    were missing from this excerpt and have been reconstructed from
    the call site (reachable[make_pop(w)]) -- confirm.
    """
    # Make a 26-tuple giving the count of each letter in the word
    pop = []
    for l in string.ascii_lowercase:
        pop.append(w.count(l))
    return tuple(pop)
def make_reduced(p):
    """Given a population p, return a sequence of populations that are
    reduced by one character from p.

    Only non-zero counts can be reduced, so e.g. (2, 1, 0, 1) yields
    (1, 1, 0, 1), (2, 0, 0, 1) and (2, 1, 0, 0) -- the possible
    predecessor populations of p.

    NOTE(review): the def line, the non-zero guard and the
    accumulate/return lines were missing from this excerpt and have
    been reconstructed from the header comment's worked example.
    """
    result = []
    for i in range(len(p)):
        # A zero count has no predecessor -- skip it.
        if p[i]:
            # Copy it and reduce the i'th element.  The extra comma is
            # there to make `reduced` a 1-tuple so that tuple
            # concatenation works.
            reduced = (p[i] - 1),
            p2 = p[:i] + reduced + p[i+1:]
            result.append(p2)
    return result
def note_reachable(reachable, w):
    """Record w in the reachable map, keyed by its letter population.

    Any later word whose population reduces to this one can then find
    w (or an anagram of it) with a single dictionary probe.
    """
    key = make_pop(w)
    reachable[key] = w
# NOTE(review): several lines are missing from this excerpt: the
# enclosing function header, the initialisation `reachable = {}`, the
# seed taken from the command line, and the `for w in all_words:` loop
# with its `w_pop = make_pop(w)` computation.  The indentation below
# is a best-effort reconstruction -- confirm against the full file.
all_words = read_words()

## all_words.sort(lambda x, y: cmp(len(x), len(y)))

# reachable is a map from population tuples to a valid word with
# that population (see note_reachable).

# start off with just the seed
note_reachable(reachable, seed)

if reachable.has_key(w_pop):
    # already have an anagram of w; nothing more to do for this word

# for each possible predecessor of w, see if it is already reachable
for v_pop in make_reduced(w_pop):
    if reachable.has_key(v_pop):
        print "%s reachable from %s" % (w, reachable[v_pop])
        note_reachable(reachable, w)
if __name__ == '__main__':
    # Cool! Psyco roughly halves the runtime.
    # NOTE(review): the surrounding lines are missing from this
    # excerpt -- presumably `try: import psyco` precedes the log call
    # and an `except ImportError:` precedes the message below, with
    # the main entry point invoked afterwards; confirm against the
    # full file.
    psyco.log('/tmp/psyco.log')
    print 'Psyco not found, ignoring it'