Changeset 189
- Timestamp: 08/16/08 23:14:08 (3 months ago)
- Files:
- trunk/shakespeare/cli.py (modified) (1 diff)
- trunk/shakespeare/search.py (modified) (1 diff)
- trunk/shakespeare/tests/functional/test_search.py (modified) (1 diff)
- trunk/shakespeare/tests/test_search.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/shakespeare/cli.py
Revision 169 Revision 189 1 #!/usr/bin/env python 1 #!/usr/bin/env python 2 2 3 import cmd 3 import cmd 4 import os 4 import os 5 import StringIO 5 import StringIO 6 6 7 class ShakespeareAdmin(cmd.Cmd): 7 class ShakespeareAdmin(cmd.Cmd): 8 """ 8 """ 9 TODO: self.verbose option and associated self._print 9 TODO: self.verbose option and associated self._print 10 """ 10 """ 11 11 12 def __init__(self, verbose=False): 12 def __init__(self, verbose=False): 13 # cmd.Cmd is not a new style class 13 # cmd.Cmd is not a new style class 14 cmd.Cmd.__init__(self) 14 cmd.Cmd.__init__(self) 15 self.verbose = verbose 15 self.verbose = verbose 16 16 17 prompt = 'The Bard > ' 17 prompt = 'The Bard > ' 18 18 19 def run_interactive(self, line=None): 19 def run_interactive(self, line=None): 20 """Run an interactive session. 20 """Run an interactive session. 21 """ 21 """ 22 print 'Welcome to shakespeare-admin interactive mode\n' 22 print 'Welcome to shakespeare-admin interactive mode\n' 23 self.do_about() 23 self.do_about() 24 print 'Type: "?" or "help" for help on commands.\n' 24 print 'Type: "?" or "help" for help on commands.\n' 25 while 1: 25 while 1: 26 try: 26 try: 27 self.cmdloop() 27 self.cmdloop() 28 break 28 break 29 except KeyboardInterrupt: 29 except KeyboardInterrupt: 30 raise 30 raise 31 31 32 def do_help(self, line=None): 32 def do_help(self, line=None): 33 cmd.Cmd.do_help(self, line) 33 cmd.Cmd.do_help(self, line) 34 34 35 def do_about(self, line=None): 35 def do_about(self, line=None): 36 import shakespeare 36 import shakespeare 37 version = shakespeare.__version__ 37 version = shakespeare.__version__ 38 about = \ 38 about = \ 39 '''Open Shakespeare version %s. Copyright the Open Knowledge Foundation. 39 '''Open Shakespeare version %s. Copyright the Open Knowledge Foundation. 40 Open Shakespeare is open-knowledge and open-source. See COPYING for details. 40 Open Shakespeare is open-knowledge and open-source. See COPYING for details. 
41 41 42 For more information about the package run `info`. 42 For more information about the package run `info`. 43 ''' % version 43 ''' % version 44 print about 44 print about 45 45 46 def do_quit(self, line=None): 46 def do_quit(self, line=None): 47 sys.exit() 47 sys.exit() 48 48 49 def do_EOF(self, *args): 49 def do_EOF(self, *args): 50 print '' 50 print '' 51 sys.exit() 51 sys.exit() 52 52 53 # ================= 53 # ================= 54 # Commands 54 # Commands 55 55 56 def do_db(self, line=None): 56 def do_db(self, line=None): 57 actions = [ 'create', 'clean', 'rebuild', 'init' ] 57 actions = [ 'create', 'clean', 'rebuild', 'init' ] 58 if line is None or line not in actions: 58 if line is None or line not in actions: 59 self.help_db() 59 self.help_db() 60 return 1 60 return 1 61 import shakespeare.model 61 import shakespeare.model 62 if line == 'init': 62 if line == 'init': 63 import pkg_resources 63 import pkg_resources 64 pkg = 'shksprdata' 64 pkg = 'shksprdata' 65 meta = pkg_resources.resource_stream(pkg, 'texts/metadata.txt') 65 meta = pkg_resources.resource_stream(pkg, 'texts/metadata.txt') 66 shakespeare.model.Material.load_from_metadata(meta) 66 shakespeare.model.Material.load_from_metadata(meta) 67 else: 67 else: 68 shakespeare.model.__dict__[line+'db']()68 print 'To create db use paster: paster setup-app {config-file}' 69 69 70 def help_db(self, line=None): 70 def help_db(self, line=None): 71 usage = \ 71 usage = \ 72 '''db { create | clean | rebuild |init }72 '''db { create | init } 73 ''' 73 ''' 74 print usage 74 print usage 75 75 76 def do_gutenberg(self, line=None): 76 def do_gutenberg(self, line=None): 77 import shakespeare.gutenberg 77 import shakespeare.gutenberg 78 helper = shakespeare.gutenberg.Helper(verbose=True) 78 helper = shakespeare.gutenberg.Helper(verbose=True) 79 if not line: 79 if not line: 80 helper.execute() 80 helper.execute() 81 elif line == 'print_index': 81 elif line == 'print_index': 82 import pprint 82 import pprint 83 
pprint.pprint(helper.get_index()) 83 pprint.pprint(helper.get_index()) 84 else: 84 else: 85 msg = 'Unknown argument %s' % line 85 msg = 'Unknown argument %s' % line 86 raise Exception(msg) 86 raise Exception(msg) 87 87 88 def help_gutenberg(self, line=None): 88 def help_gutenberg(self, line=None): 89 usage = \ 89 usage = \ 90 """ 90 """ 91 Download and process all Project Gutenberg shakespeare texts""" 91 Download and process all Project Gutenberg shakespeare texts""" 92 print usage 92 print usage 93 93 94 def do_moby(self, line=None): 94 def do_moby(self, line=None): 95 import shakespeare.moby 95 import shakespeare.moby 96 helper = shakespeare.moby.Helper(verbose=True) 96 helper = shakespeare.moby.Helper(verbose=True) 97 if not line: 97 if not line: 98 helper.execute() 98 helper.execute() 99 elif line == 'print_index': 99 elif line == 'print_index': 100 import pprint 100 import pprint 101 pprint.pprint(helper.get_index()) 101 pprint.pprint(helper.get_index()) 102 else: 102 else: 103 msg = 'Unknown argument %s' % line 103 msg = 'Unknown argument %s' % line 104 raise Exception(msg) 104 raise Exception(msg) 105 105 106 def help_moby(self, line=None): 106 def help_moby(self, line=None): 107 usage = \ 107 usage = \ 108 ''' 108 ''' 109 Download and process all Moby/Bosak shakespeare texts''' 109 Download and process all Moby/Bosak shakespeare texts''' 110 print usage 110 print usage 111 111 112 def _init_index(self): 112 def _init_index(self): 113 import shakespeare.index 113 import shakespeare.index 114 self._index = shakespeare.index.all 114 self._index = shakespeare.index.all 115 115 116 def _filter_index(self, line): 116 def _filter_index(self, line): 117 """Filter items in index return only those whose id (url) is in line 117 """Filter items in index return only those whose id (url) is in line 118 If line is empty or None return all items 118 If line is empty or None return all items 119 """ 119 """ 120 if line: 120 if line: 121 textsToAdd = [] 121 textsToAdd = [] 
122 textNames = line.split() 122 textNames = line.split() 123 for item in self._index: 123 for item in self._index: 124 if item.name in textNames: 124 if item.name in textNames: 125 textsToAdd.append(item) 125 textsToAdd.append(item) 126 return textsToAdd 126 return textsToAdd 127 else: 127 else: 128 self._init_index() 128 self._init_index() 129 return self._index 129 return self._index 130 130 131 def do_index(self, line): 131 def do_index(self, line): 132 self._init_index() 132 self._init_index() 133 header = \ 133 header = \ 134 ''' +-------------------+ 134 ''' +-------------------+ 135 | Index of Material | 135 | Index of Material | 136 +-------------------+ 136 +-------------------+ 137 137 138 ''' 138 ''' 139 print header 139 print header 140 for row in self._index: 140 for row in self._index: 141 print row.name.ljust(35), row.title 141 print row.name.ljust(35), row.title 142 142 143 def help_index(self, line=None): 143 def help_index(self, line=None): 144 usage = \ 144 usage = \ 145 '''Print index of Shakespeare texts to stdout''' 145 '''Print index of Shakespeare texts to stdout''' 146 print usage 146 print usage 147 147 148 def do_concordance(self, line=None):149 self._init_index()150 print 'Making concordance (this may take some time ...):'151 from shakespeare.concordance import ConcordanceBuilder152 import time153 start = end = 0154 start = time.time()155 cc = ConcordanceBuilder()156 textsToAdd = []157 if line is not None:158 textsToAdd = self._filter_index(line)159 else:160 def gut_non_folio(material):161 return '_gut' in material.name and 'gut_f' not in material.name162 textsToAdd = filter(gut_non_folio, self._index)163 for item in textsToAdd:164 print 'Adding: %s (%s)' % (item.name, item.title)165 cc.add_text(item.name)166 end = time.time()167 timetaken = end - start168 print 'Finished. 
Time taken was %ss' % timetaken169 170 def help_concordance(self, line=None):171 usage = \172 '''Create a concordance173 174 If no arguments supplied then use all non-folio gutenberg shakespeare texts.175 Otherwise arguments should be a space seperated list of work name ids176 '''177 print usage178 179 def do_runserver(self, line=None): 148 def do_runserver(self, line=None): 180 self.help_runserver() 149 self.help_runserver() 181 150 182 def help_runserver(self, line=None): 151 def help_runserver(self, line=None): 183 usage = \ 152 usage = \ 184 '''This command has been DEPRECATED. 153 '''This command has been DEPRECATED. 185 154 186 Please use `paster serve` to run a server now, e.g.:: 155 Please use `paster serve` to run a server now, e.g.:: 187 156 188 paster serve <my-config.ini> 157 paster serve <my-config.ini> 189 ''' 158 ''' 190 print usage 159 print usage 191 160 192 def do_info(self, line=None): 161 def do_info(self, line=None): 193 import shakespeare 162 import shakespeare 194 info = shakespeare.__doc__ 163 info = shakespeare.__doc__ 195 print 164 print 196 print ' ## Open Shakespeare ##' 165 print ' ## Open Shakespeare ##' 197 print info 166 print info 198 167 199 def help_info(self, line=None): 168 def help_info(self, line=None): 200 print 'Information about this package.' 169 print 'Information about this package.' 
201 170 202 def do_search_add(self, line=None): 171 def _parse_line(self, line): 203 path = line.strip() 172 line = line.strip() 204 if not os.path.exists(path): 173 args = line.split() 205 print '"%s" is not an existent path' % path 174 action = '' 206 return 1 175 remainder = '' 207 if os.path.isdir(path): 176 if len(args) > 0: 208 fns = os.listdir(path) 177 action = args[0] 209 fns = filter(lambda x: x.endswith('.txt'), fns) 178 if len(args) > 1: 210 works = [ os.path.join(path, fn) for fn in fns ] 179 remainder = ' '.join(args[1:]) 211 else: 180 return (action, remainder) 212 works = [ path ] 181 182 def do_search(self, line): 213 import shakespeare.search 183 import shakespeare.search 214 index = shakespeare.search.SearchIndex.default_index() 184 index = shakespeare.search.SearchIndex.default_index() 215 for work in works: 185 216 if self.verbose: 186 action, extra = self._parse_line(line) 217 print 'Processing %s' % work 187 if action == 'addpath': 218 fileobj = open(work) 188 index.add_from_path(extra) 189 elif action == 'query': 190 results = index.search(extra) 191 print index.print_matches(results) 192 elif action == 'addtext': 193 import shakespeare.model as model 194 text = model.Material.byName(extra) 195 fileobj = text.get_text() 219 index.add_item(fileobj) 196 index.add_item(fileobj) 220 197 elif action == 'init': 221 def help_search_add(self, line=None): 198 self._init_index() 222 info = '''search_add {path} 199 for text in self._index: 223 200 fileobj = text.get_text() 224 Add contents of {path} (file itself or all text files in directory if 201 index.add_item(fileobj) 225 directory) to the search index.''' 202 else: 203 print 'Unrecognized action: %s' % action 204 self.help_search() 205 return 1 206 207 def help_search(self, line=None): 208 info = \ 209 ''' 210 search addpath {path} 211 - Add contents of {path} (file itself or all text files in directory if 212 directory) to the search index. 
213 214 search addtext {name} 215 - Add db text named {name} to search index. 216 217 search query {query} 218 - Query search index with {query}. 219 220 search init 221 - Add all texts in DB to index. 222 ''' 226 print info 223 print info 227 224 228 def do_search_add_all(self): 225 def do_stats(self, line): 229 # TODO: automatically add all texts listed in index 226 action, extra = self._parse_line(line) 230 pass 227 231 228 import shakespeare.stats 232 def do_search(self, line=None): 229 stats = shakespeare.stats.Stats() 233 import shakespeare.search 230 if action == 'init': 234 index = shakespeare.search.SearchIndex.default_index() 231 self._init_index() 235 query = line.strip() 232 for text in self._index: 236 if not query: 233 stats.statsify(text, text.get_text()) 237 print 'No search term supplied.' 234 elif action == 'addtext': 235 import shakespeare.model as model 236 text = model.Material.byName(extra) 237 stats.statsify(text, text.get_text()) 238 elif action == 'show': 239 textstats = stats.text_stats(extra) 240 for s in textstats: 241 print s.word, s.freq 242 else: 243 print 'Unrecognized action: %s' % action 244 self.help_stats() 238 return 1 245 return 1 239 matches = index.search(query) 246 240 print "%i results found." % matches.get_matches_estimated() 247 def help_stats(self, line=None): 241 print "Results 1-%i:" % matches.size() 248 info = \ 242 249 ''' 243 for m in matches: 250 stats addtext {name} 244 print 251 - Add db text named {name} to stats index. 245 print '%i: %i%% docid=%i' % (m.rank + 1, m.percent, m.docid) 252 246 print m.document.get_data() 253 stats show {name} 247 254 - Query stats index with {query}. 248 def help_search(self, line=None): 255 249 info = 'Supply a query with which to search the search index.' 256 stats init 257 - Prepare statistics for all texts in DB. 
258 ''' 250 print info 259 print info 260 251 261 252 def main(): 262 def main(): 253 import optparse 263 import optparse 254 usage = \ 264 usage = \ 255 '''%prog [options] <command> 265 '''%prog [options] <command> 256 266 257 Run about or help for details.''' 267 Run about or help for details.''' 258 parser = optparse.OptionParser(usage) 268 parser = optparse.OptionParser(usage) 259 parser.add_option('-v', '--verbose', dest='verbose', help='Be verbose', 269 parser.add_option('-v', '--verbose', dest='verbose', help='Be verbose', 260 action='store_true', default=False) 270 action='store_true', default=False) 261 options, args = parser.parse_args() 271 options, args = parser.parse_args() 262 272 263 if len(args) == 0: 273 if len(args) == 0: 264 parser.print_help() 274 parser.print_help() 265 return 1 275 return 1 266 else: 276 else: 267 cmd = ShakespeareAdmin(verbose=options.verbose) 277 cmd = ShakespeareAdmin(verbose=options.verbose) 268 args = ' '.join(args) 278 args = ' '.join(args) 269 args = args.replace('-','_') 279 args = args.replace('-','_') 270 cmd.onecmd(args) 280 cmd.onecmd(args) 271 281 trunk/shakespeare/search.py
Revision 172 Revision 189 1 '''Support for indexing and searching texts using xapian. 1 '''Support for indexing and searching texts using xapian. 2 2 3 Architecture 3 Architecture 4 ============ 4 ============ 5 5 6 For information on theoretical structure of Xapain see: 6 For information on theoretical structure of Xapain see: 7 http://xapian.org/docs/intro_ir.html 7 http://xapian.org/docs/intro_ir.html 8 8 9 For basic demo python code see: http://xapian.org/docs/bindings/python/ 9 For basic demo python code see: http://xapian.org/docs/bindings/python/ 10 10 11 For helpful example of using Xapian in python (including metadata, add_post 11 For helpful example of using Xapian in python (including metadata, add_post 12 etc) see: 12 etc) see: 13 13 14 * http://www.thesamet.com/blog/2007/02/04/pumping-up-your-applications-with-xapian-full-text-search/ 14 * http://www.thesamet.com/blog/2007/02/04/pumping-up-your-applications-with-xapian-full-text-search/ 15 * http://www.rkblog.rk.edu.pl/w/p/xapian-python/ 15 * http://www.rkblog.rk.edu.pl/w/p/xapian-python/ 16 16 17 Here we discuss how we can use Xapian in OS. Two main tasks: 17 Here we discuss how we can use Xapian in OS. Two main tasks: 18 18 19 1. Do search 19 1. Do search 20 2. Produce statistics 20 2. Produce statistics 21 21 22 Second task just requires stemming support, first requires full Xapian 22 Second task just requires stemming support, first requires full Xapian 23 facilities. Main question for indexing is: 23 facilities. Main question for indexing is: 24 24 25 * What is our atomization level. I.e. what are 'documents' we index? Is it: 25 * What is our atomization level. I.e. what are 'documents' we index? Is it: 26 * A whole poem or play 26 * A whole poem or play 27 * Is it a paragraph within a work 27 * Is it a paragraph within a work 28 * Is it a character's whole speech? 28 * Is it a character's whole speech? 29 29 30 TODO: 30 TODO: 31 * add metadata (e.g. which character is speaking, work id ...) 
31 * add metadata (e.g. which character is speaking, work id ...) 32 ''' 32 ''' 33 import os 33 import os 34 import re 34 import re 35 35 36 import xapian 36 import xapian 37 37 38 class SearchIndex(object): 38 class SearchIndex(object): 39 def __init__(self, index_dir): 39 def __init__(self, index_dir): 40 self.index_dir = index_dir 40 self.index_dir = index_dir 41 41 42 @classmethod 42 @classmethod 43 def config_index_dir(self): 43 def config_index_dir(self): 44 '''Get the search index directory specified in the config.''' 44 '''Get the search index directory specified in the config.''' 45 import shakespeare 45 import shakespeare 46 conf = shakespeare.conf() 46 conf = shakespeare.conf() 47 index_dir = conf['search_index_dir'] 47 index_dir = conf['search_index_dir'] 48 return index_dir 48 return index_dir 49 49 50 @classmethod 50 @classmethod 51 def default_index(self): 51 def default_index(self): 52 '''Return a SearchIndex instance initialized with the path specified in 52 '''Return a SearchIndex instance initialized with the path specified in 53 the configuration file. 53 the configuration file. 54 ''' 54 ''' 55 index_dir = self.config_index_dir() 55 index_dir = self.config_index_dir() 56 if not os.path.exists(index_dir): 56 if not os.path.exists(index_dir): 57 os.makedirs(index_dir) 57 os.makedirs(index_dir) 58 return SearchIndex(index_dir) 58 return SearchIndex(index_dir) 59 59 60 @classmethod61 def get_stats(self, fileobj):62 '''Get statistics on text in fileobj.63 64 Words are stemmed so that e.g. love and loved count as the same word.65 '''66 # (?) 
maybe could use xapian.TermGenerator to split document67 WORD_RE = re.compile('\\w{1,32}', re.U)68 stemmer = xapian.Stem('english')69 results = {}70 text = fileobj.read()71 text = text.encode('utf8')72 for term in WORD_RE.finditer(text):73 word = term.group()74 word = word.lower()75 stemmed_word = stemmer(word)76 results[stemmed_word] = results.get(stemmed_word, 0) + 177 return results78 79 def add_item(self, fileobj): 60 def add_item(self, fileobj): 80 document = xapian.WritableDatabase (self.index_dir, xapian.DB_CREATE_OR_OPEN) 61 document = xapian.WritableDatabase (self.index_dir, xapian.DB_CREATE_OR_OPEN) 81 indexer = xapian.TermGenerator() 62 indexer = xapian.TermGenerator() 82 stemmer = xapian.Stem("english") 63 stemmer = xapian.Stem("english") 83 indexer.set_stemmer(stemmer) 64 indexer.set_stemmer(stemmer) 84 65 85 para = '' 66 para = '' 86 try: 67 try: 87 for line in fileobj: 68 for line in fileobj: 88 line = line.strip() 69 line = line.strip() 89 if line == '': 70 if line == '': 90 if para != '': 71 if para != '': 91 doc = xapian.Document() 72 doc = xapian.Document() 92 doc.set_data(para) 73 doc.set_data(para) 93 74 94 indexer.set_document(doc) 75 indexer.set_document(doc) 95 # this *will* include positional information 76 # this *will* include positional information 96 indexer.index_text(para) 77 indexer.index_text(para) 97 78 98 # Add the document to the database. 79 # Add the document to the database. 99 document.add_document(doc) 80 document.add_document(doc) 100 para = '' 81 para = '' 101 else: 82 else: 102 if para != '': 83 if para != '': 103 para += '\n' 84 para += '\n' 104 para += line 85 para += line 105 except StopIteration: 86 except StopIteration: 106 # TODO: what is happening here? 87 # TODO: what is happening here? 107 pass 88 pass 108 89 109 def search(self, query_string): 90 def search(self, query_string): 110 # Open the database for searching. 91 # Open the database for searching. 
111 database = xapian.Database(self.index_dir) 92 database = xapian.Database(self.index_dir) 112 93 113 # Start an enquire session. 94 # Start an enquire session. 114 enquire = xapian.Enquire(database) 95 enquire = xapian.Enquire(database) 115 96 116 # Parse the query string to produce a Xapian::Query object. 97 # Parse the query string to produce a Xapian::Query object. 117 qp = xapian.QueryParser() 98 qp = xapian.QueryParser() 118 stemmer = xapian.Stem("english") 99 stemmer = xapian.Stem("english") 119 qp.set_stemmer(stemmer) 100 qp.set_stemmer(stemmer) 120 qp.set_database(database) 101 qp.set_database(database) 121 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 102 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 122 query = qp.parse_query(query_string) 103 query = qp.parse_query(query_string) 123 print "Parsed query is: %s" % query.get_description() 104 print "Parsed query is: %s" % query.get_description() 124 105 125 # Find the top 10 results for the query. 106 # Find the top 10 results for the query. 
126 enquire.set_query(query) 107 enquire.set_query(query) 127 # get search results offset, offset+count 108 # get search results offset, offset+count 128 offset = 0 109 offset = 0 129 count = 10 110 count = 10 130 matches = enquire.get_mset(offset, count) 111 matches = enquire.get_mset(offset, count) 131 return matches 112 return matches 132 113 114 def add_from_path(self, path): 115 '''Add contents of {path} (file itself or all text files in directory 116 if directory) to the search index.''' 117 path = path.strip() 118 if not os.path.exists(path): 119 print '"%s" is not an existent path' % path 120 return 1 121 if os.path.isdir(path): 122 fns = os.listdir(path) 123 fns = filter(lambda x: x.endswith('.txt'), fns) 124 works = [ os.path.join(path, fn) for fn in fns ] 125 else: 126 works = [ path ] 127 for work in works: 128 if self.verbose: 129 print 'Processing %s' % work 130 fileobj = open(work) 131 self.index.add_item(fileobj) 132 133 @classmethod 133 @classmethod 134 def print_matches(self, matches): 134 def print_matches(self, matches): 135 # Display the results. 135 # Display the results. 136 print "%i results found."% matches.get_matches_estimated()136 msg = '%i results found.' % matches.get_matches_estimated() 137 print "Results 1-%i:"% matches.size()137 msg += 'Results 1-%i:' % matches.size() 138 138 139 for m in matches: 139 for m in matches: 140 print "%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data()) 140 msg += '\n' 141 msg += '%i: %i%% docid=%i' % (m.rank + 1, m.percent, m.docid) 142 msg += '\n' 143 msg += m.document.get_data() 144 msg += '\n' 145 return msg 141 146 147 trunk/shakespeare/tests/functional/test_search.py
Revision 170 Revision 189 1 import StringIO 2 1 from shakespeare.tests import * 3 from shakespeare.tests import * 2 4 5 import shakespeare.search 6 3 class TestSearchController(TestController): 7 class TestSearchController(TestController): 8 9 def setUp(self): 10 # TODO: remove this item from index in tearDown 11 text = make_fixture() 12 sindex = shakespeare.search.SearchIndex.default_index() 13 sindex.add_item(StringIO.StringIO(text.content)) 4 14 5 def test_index(self): 15 def test_index(self): 6 url = url_for(controller='search') 16 url = url_for(controller='search') 7 res = self.app.get(url) 17 res = self.app.get(url) 8 assert "Search" in res 18 assert "Search" in res 9 19 10 def test_search(self): 20 def test_search(self): 11 url = url_for(controller='search') 21 url = url_for(controller='search') 12 res = self.app.get(url) 22 res = self.app.get(url) 13 form = res.forms[0] 23 form = res.forms[0] 14 # for this to work need to have added phoenix to the index 24 form['query'] = 'summer' 15 # TODO: put this in setUp or something ... 16 form['query'] = 'Phoenix' 17 res = form.submit() 25 res = form.submit() 18 assert 'Search Results' in res 26 assert 'Search Results' in res 19 assert ' Phoenix' in res27 assert 'Shall I compare thee' in res 20 28 trunk/shakespeare/tests/test_search.py
Revision 182 Revision 189 1 import os 1 import os 2 import shutil 2 import shutil 3 import tempfile 3 import tempfile 4 import StringIO 4 import StringIO 5 5 6 import shakespeare.search 6 import shakespeare.search 7 7 8 class TestSearch: 8 class TestSearch: 9 # break up a little to make indexing more interesting 9 # break up a little to make indexing more interesting 10 text = \ 10 text = \ 11 ''' 11 ''' 12 Shall I compare thee to a summer's day? 12 Shall I compare thee to a summer's day? 13 Thou art more lovely and more temperate: 13 Thou art more lovely and more temperate: 14 Rough winds do shake the darling buds of May, 14 Rough winds do shake the darling buds of May, 15 And summer's lease hath all too short a date: 15 And summer's lease hath all too short a date: 16 16 17 Sometime too hot the eye of heaven shines, 17 Sometime too hot the eye of heaven shines, 18 And often is his gold complexion dimm'd, 18 And often is his gold complexion dimm'd, 19 And every fair from fair sometime declines, 19 And every fair from fair sometime declines, 20 By chance, or nature's changing course untrimm'd: 20 By chance, or nature's changing course untrimm'd: 21 21 22 But thy eternal summer shall not fade, 22 But thy eternal summer shall not fade, 23 Nor lose possession of that fair thou ow'st, 23 Nor lose possession of that fair thou ow'st, 24 Nor shall death brag thou wander'st in his shade, 24 Nor shall death brag thou wander'st in his shade, 25 When in eternal lines to time thou grow'st, 25 When in eternal lines to time thou grow'st, 26 26 27 So long as men can breathe, or eyes can see, 27 So long as men can breathe, or eyes can see, 28 So long lives this, and this gives life to thee. 28 So long lives this, and this gives life to thee. 
29 ''' 29 ''' 30 30 31 def setUp(self): 31 def setUp(self): 32 basetmp = tempfile.gettempdir() 32 basetmp = tempfile.gettempdir() 33 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 33 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 34 # we leave directory in existence to help with debugging 34 # we leave directory in existence to help with debugging 35 if os.path.exists(self.tmpdir): 35 if os.path.exists(self.tmpdir): 36 shutil.rmtree(self.tmpdir) 36 shutil.rmtree(self.tmpdir) 37 os.makedirs(self.tmpdir) 37 os.makedirs(self.tmpdir) 38 self.index = shakespeare.search.SearchIndex(self.tmpdir) 38 self.index = shakespeare.search.SearchIndex(self.tmpdir) 39 39 40 def test_add_item(self): 40 def test_add_item(self): 41 self.index.add_item(StringIO.StringIO(self.text)) 41 self.index.add_item(StringIO.StringIO(self.text)) 42 42 43 def test_search(self): 43 def test_search(self): 44 self.index.add_item(StringIO.StringIO(self.text)) 44 self.index.add_item(StringIO.StringIO(self.text)) 45 out = self.index.search('summer') 45 out = self.index.search('summer') 46 assert len(out) == 2 46 assert len(out) == 2 47 mset1 = out[1] 47 mset1 = out[1] 48 # 'But thy eternal summer ... 48 # 'But thy eternal summer ... 49 exp = "But thy eternal summer shall not fade,\nNor lose possession of that fair thou ow'st," 49 exp = "But thy eternal summer shall not fade,\nNor lose possession of that fair thou ow'st," 50 assert mset1.document.get_data().startswith(exp) 50 assert mset1.document.get_data().startswith(exp) 51 out = self.index.search('rough') 51 out = self.index.search('rough') 52 assert len(out) == 1 52 assert len(out) == 1 53 53 54 def test_get_stats(self):55 simpletext = 'Death death dead love loved loving'56 out = self.index.get_stats(StringIO.StringIO(simpletext))57 assert len(out) == 358 assert out['love'] == 359 assert out['death'] == 260 assert out['dead'] == 161
