growwords.py

   1 #! /usr/bin/python
   2
   3 # Copyright (C) 2003 by Martin Pool <mbp@sourcefrog.net>
   4
   5 # Given a dictionary of words, find the longest word that can be grown
   6 # from a three-letter seed by successively adding one letter, then
   7 # rearranging.  All the intermediate stages must occur in the
   8 # dictionary.
   9
  10 # The solution is determined by the dictionary and the seed.
  11
  12 # There may be more than one equally long solution.
  13
  14 # Since we rearrange the word after each addition we just need to
  15 # remember the population count of letters, not the particular word.
  16
  17 # Perhaps we could use a Python 26-tuple giving the count of each
  18 # letter as a key...
  19
  20 # It seems like a greedy algorithm might work here...
  21
  22 # Algorithm: We hold a list of known 'reachable' words.  Initialize
  23 # the list of good words with the seed word.  Read the dictionary and
  24 # squash case and discard non-letter symbols.  Iterate through the
  25 # dictionary, considering the words in order of size.
  26
  27 # For each word in the dictionary, it is reachable if it can be made
  28 # by adding a letter to any shorter reachable word, and then
  29 # rearranging.  Therefore it does not matter what order we process the
  30 # words, as long as we do them in strictly ascending order by length.
  31
  32 # How do we efficiently find out if it is reachable?  A
  33 # straightforward way is to try removing each unique letter from the
  34 # word, and see if that population is reachable.  In other words, we
  35 # turn the word into its population, and then generate the set of
  36 # populations that are decreased by one in every non-zero count.
  37
  38 # So "aabd" has the population (2, 1, 0, 1).  This is reachable if we
  39 # have (1, 1, 0, 1), or (2, 0, 0, 1) or (2, 1, 0, 0).
  40
  41 # So for each word, we need to check a number of possible predecessors
  42 # limited to the number of unique letters in the word; at most 26.  If
  43 # we store the reachable words in a hashtable indexed by population,
  44 # we can do the probes in O(1) time.
  45
  46 # Therefore we can do the whole thing in time proportional to the
  47 # number of words, O(n).  I think this is optimal, since we have to at
  48 # the very least read in each word and therefore cannot be quicker
  49 # than O(n).
  50
  51 # In the first version I read all the words into a single list and
  52 # then sorted it by length.  Doing the sort by length is quite slow in
  53 # Python and in addition this is at best O(n log n).  Doing a complete
  54 # sort is a lot of work when we require just a very rough sort by
  55 # size.  So instead we read the words into buckets by length, then
  56 # join them up.
  57
  58 # The bucket thing may be a bit more than O(n), because of the
  59 # possible cost of copying things as we join the buckets.  But it is
  60 # not very expensive compared to the actual search.  There are a
  61 # fairly small number of buckets anyhow -- just proportional to the
  62 # longest word.
  63
  64 # Solution:
  65
  66 # Seed is given as the command line argument; words are read from
  67 # stdin.  This solution doesn't require the seed to be three letters.
  68
  69 # A good way to feed it is from
  70
  71 #   grep '^[a-z]*$' /usr/share/dict/words
  72
  73
  74
  75
  76 import sys, string
  77 from pprint import pprint
  78 from xreadlines import xreadlines
  79 import re
  80 import profile
  81
  82 letters_re = re.compile('[a-z]*')
  83
  84 def read_words():
  85     """Read from stdin onto all_words"""
  86     # indexed by length; contents is a list of words of that length
  87     by_len = {}
  88
  89     # XXX: You'll get a deprecation warning here for Python 2.3.  I just use
  90     # xreadlines for the benefit of old machines.
  91
  92     for w in xreadlines(sys.stdin):
  93         if w[-1] == '\n':
  94             w = w[:-1]                  # chomp
  95
  96         # check chars are reasonable
  97         if not letters_re.match(w):
  98             raise ValueError()
  99
 100         w = w.lower()
 101         l = len(w)
 102
 103         # Put it into the right bucket for its length.  Make a new
 104         # one if needed.
 105         wl = by_len.get(l)
 106         if wl is None:
 107             wl = []
 108             by_len[l] = wl
 109         wl.append(w)
 110
 111     # Now join up all the buckets so that we have one big list, sorted by
 112     # word length
 113     all_words = []
 114     lens = by_len.keys()
 115     lens.sort()
 116     for l in lens:
 117         all_words.extend(by_len[l])
 118
 119     return all_words
 120
 121
 122 def make_pop(w):
 123     # Make a 26-tuple giving the count of each letter in a word
 124     pop = []
 125     for l in string.ascii_lowercase:
 126         pop.append(w.count(l))
 127     return tuple(pop)
 128
 129 def make_reduced(p):
 130     """Given a population p, return a sequence of populations that are
 131     reduced by one character from p."""
 132     r = []
 133     for i in range(len(p)):
 134         if p[i] > 0:
 135             # Copy it and reduce the i'th element.  The extra comma is
 136             # to form a tuple.
 137             reduced = (p[i] - 1),
 138             p2 = p[:i] + reduced + p[i+1:]
 139             r.append(p2)
 140     return r
 141
 142 def note_reachable(reachable, w):
 143     reachable[make_pop(w)] = w
 144
 145 def main():
 146     seed = sys.argv[1]
 147     all_words = read_words()
 148
 149     ## all_words.sort(lambda x, y: cmp(len(x), len(y)))
 150     ## pprint(all_words)
 151
 152     # reachable is a map from population tuples to a valid word with
 153     # that population
 154     reachable = {}
 155
 156     # start off with just the seed
 157     note_reachable(reachable, seed)
 158
 159     for w in all_words:
 160         w_pop = make_pop(w)
 161         if reachable.has_key(w_pop):
 162             # already have an anagram of w
 163             continue
 164
 165         # for each possible predecessor of w, see if it is already reachable
 166         for v_pop in make_reduced(w_pop):
 167             if reachable.has_key(v_pop):
 168                 print "%s reachable from %s" % (w, reachable[v_pop])
 169                 note_reachable(reachable, w)
 170                 break
 171
 172
 173 if __name__ == '__main__':
 174     try:
 175         # Cool!  Psyco roughly halves the runtime.
 176         import psyco
 177         psyco.log('/tmp/psyco.log')
 178         psyco.full()
 179     except:
 180         print 'Psyco not found, ignoring it'
 181     main()