Changeset 191
- Timestamp: 08/17/08 00:51:24 (3 months ago)
- Files:
- trunk/shakespeare/controllers/search.py (modified) (1 diff)
- trunk/shakespeare/search.py (modified) (1 diff)
- trunk/shakespeare/templates/search/index.html (modified) (1 diff)
- trunk/shakespeare/tests/functional/test_search.py (modified) (1 diff)
- trunk/shakespeare/tests/test_search.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/shakespeare/controllers/search.py
Revision 170 Revision 191 1 import logging 1 import logging 2 2 3 from shakespeare.lib.base import * 3 from shakespeare.lib.base import * 4 4 5 log = logging.getLogger(__name__) 5 log = logging.getLogger(__name__) 6 6 7 import shakespeare.search 7 import shakespeare.search 8 8 9 class SearchController(BaseController): 9 class SearchController(BaseController): 10 10 11 def index(self): 11 def index(self): 12 query = request.params.get('query', '') 12 c.query = request.params.get('query', '') 13 if query: 13 if c.query: 14 c.matches = self._get_results(query) 14 c.matches = self._get_matches(c.query) 15 c.results = self._get_results(c.matches) 15 c.total = c.matches.get_matches_estimated() 16 c.total = c.matches.get_matches_estimated() 16 else: 17 else: 17 c.matches = None 18 c.matches = None 18 return render('search/index') 19 return render('search/index') 19 20 20 def _get_ results(self, query):21 def _get_matches(self, query): 21 index = shakespeare.search.SearchIndex.default_index() 22 index = shakespeare.search.SearchIndex.default_index() 22 matches = index.search(query )23 matches = index.search(query, numresults=50) 23 return matches 24 return matches 24 25 25 26 def _get_results(self, matches): 27 results = [] 28 for m in matches: 29 text, lineno = self._match_to_text(m) 30 if text: 31 # slight hack -- just attach direct to object 32 text._lineno = lineno 33 text._snippet = m.document.get_data() 34 results.append(text) 35 else: 36 # TODO: create a dummy text ... 37 pass 38 return results 39 40 def _match_to_text(self, m): 41 item_id = m.document.get_value(shakespeare.search.ITEM_ID) 42 text = model.Material.byName(item_id) 43 lineno = m.document.get_value(shakespeare.search.LINE_NO) 44 return (text, lineno) 45 trunk/shakespeare/search.py
Revision 189 Revision 191 1 '''Support for indexing and searching texts using xapian. 1 '''Support for indexing and searching texts using xapian. 2 2 3 Architecture 3 Architecture 4 ============ 4 ============ 5 5 6 For information on theoretical structure of Xapain see: 6 For information on theoretical structure of Xapain see: 7 http://xapian.org/docs/intro_ir.html 7 http://xapian.org/docs/intro_ir.html 8 8 9 For basic demo python code see: http://xapian.org/docs/bindings/python/ 9 For basic demo python code see: http://xapian.org/docs/bindings/python/ 10 10 11 For helpful example of using Xapian in python (including metadata, add_post 11 For helpful example of using Xapian in python (including metadata, add_post 12 etc) see: 12 etc) see: 13 13 14 * http://www.thesamet.com/blog/2007/02/04/pumping-up-your-applications-with-xapian-full-text-search/ 14 * http://www.thesamet.com/blog/2007/02/04/pumping-up-your-applications-with-xapian-full-text-search/ 15 * http://www.rkblog.rk.edu.pl/w/p/xapian-python/ 15 * http://www.rkblog.rk.edu.pl/w/p/xapian-python/ 16 16 17 Here we discuss how we can use Xapian in OS. Two main tasks: 17 Here we discuss how we can use Xapian in OS. Two main tasks: 18 18 19 1. Do search 19 1. Do search 20 2. Produce statistics 20 2. Produce statistics 21 21 22 Second task just requires stemming support, first requires full Xapian 22 Second task just requires stemming support, first requires full Xapian 23 facilities. Main question for indexing is: 23 facilities. Main question for indexing is: 24 24 25 * What is our atomization level. I.e. what are 'documents' we index? Is it: 25 * What is our atomization level. I.e. what are 'documents' we index? Is it: 26 * A whole poem or play 26 * A whole poem or play 27 * Is it a paragraph within a work 27 * Is it a paragraph within a work 28 * Is it a character's whole speech? 28 * Is it a character's whole speech? 29 29 30 TODO: 30 TODO: 31 * add metadata (e.g. which character is speaking, work id ...) 
31 * add metadata (e.g. which character is speaking, work id ...) 32 ''' 32 ''' 33 import os 33 import os 34 import re 34 import re 35 35 36 import xapian 36 import xapian 37 37 38 # keys for document values 39 ITEM_ID = 0 40 LINE_NO = 1 41 38 class SearchIndex(object): 42 class SearchIndex(object): 39 def __init__(self, index_dir): 43 def __init__(self, index_dir): 40 self.index_dir = index_dir 44 self.index_dir = index_dir 41 45 42 @classmethod 46 @classmethod 43 def config_index_dir(self): 47 def config_index_dir(self): 44 '''Get the search index directory specified in the config.''' 48 '''Get the search index directory specified in the config.''' 45 import shakespeare 49 import shakespeare 46 conf = shakespeare.conf() 50 conf = shakespeare.conf() 47 index_dir = conf['search_index_dir'] 51 index_dir = conf['search_index_dir'] 48 return index_dir 52 return index_dir 49 53 50 @classmethod 54 @classmethod 51 def default_index(self): 55 def default_index(self): 52 '''Return a SearchIndex instance initialized with the path specified in 56 '''Return a SearchIndex instance initialized with the path specified in 53 the configuration file. 57 the configuration file. 
54 ''' 58 ''' 55 index_dir = self.config_index_dir() 59 index_dir = self.config_index_dir() 56 if not os.path.exists(index_dir): 60 if not os.path.exists(index_dir): 57 os.makedirs(index_dir) 61 os.makedirs(index_dir) 58 return SearchIndex(index_dir) 62 return SearchIndex(index_dir) 59 63 60 def add_item(self, fileobj ):64 def add_item(self, fileobj, item_id=None): 61 d ocument = xapian.WritableDatabase(self.index_dir, xapian.DB_CREATE_OR_OPEN)65 database = xapian.WritableDatabase(self.index_dir, xapian.DB_CREATE_OR_OPEN) 62 indexer = xapian.TermGenerator() 66 indexer = xapian.TermGenerator() 63 stemmer = xapian.Stem("english") 67 stemmer = xapian.Stem("english") 64 indexer.set_stemmer(stemmer) 68 indexer.set_stemmer(stemmer) 65 69 66 para = '' 70 para = '' 67 try: 71 try: 72 count = -1 73 para_start = 0 68 for line in fileobj: 74 for line in fileobj: 75 count += 1 69 line = line.strip() 76 line = line.strip() 70 if line == '': 77 if line == '': 71 if para != '': 78 if para != '': 72 doc = xapian.Document() 79 doc = xapian.Document() 73 doc.set_data(para) 80 doc.set_data(para) 81 id_term = 'I' + str(item_id) 82 doc.add_term(id_term) 83 doc.add_value(ITEM_ID, str(item_id)) 84 doc.add_value(LINE_NO, str(para_start)) 74 85 75 indexer.set_document(doc) 86 indexer.set_document(doc) 76 # this *will* include positional information 87 # this *will* include positional information 77 indexer.index_text(para) 88 indexer.index_text(para) 78 89 79 # Add the document to the database.90 database.add_document(doc) 80 document.add_document(doc)91 # assume next para starts 81 para = '' 92 para = '' 93 # must come after 94 para_start = count 82 else: 95 else: 83 if para != '': 96 if para != '': 84 para += '\n' 97 para += '\n' 85 para += line 98 para += line 86 except StopIteration: 99 except StopIteration: 87 # TODO: what is happening here? 100 # TODO: what is happening here? 
88 pass101 raise 89 102 90 def search(self, query_string): 103 def get_database(self): 91 # Open the database for searching. 92 database = xapian.Database(self.index_dir) 104 database = xapian.Database(self.index_dir) 105 return database 93 106 94 # Start an enquire session. 107 def search(self, query_string, offset=0, numresults=10): 108 database = self.get_database() 95 enquire = xapian.Enquire(database) 109 enquire = xapian.Enquire(database) 96 97 # Parse the query string to produce a Xapian::Query object.98 qp = xapian.QueryParser() 110 qp = xapian.QueryParser() 99 stemmer = xapian.Stem("english") 111 stemmer = xapian.Stem("english") 100 qp.set_stemmer(stemmer) 112 qp.set_stemmer(stemmer) 101 qp.set_database(database) 113 qp.set_database(database) 102 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 114 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 103 query = qp.parse_query(query_string) 115 query = qp.parse_query(query_string) 104 print "Parsed query is: %s" % query.get_description()105 106 # Find the top 10 results for the query.107 enquire.set_query(query) 116 enquire.set_query(query) 108 # get search results offset, offset+count 117 matches = enquire.get_mset(offset, numresults) 109 offset = 0 110 count = 10 111 matches = enquire.get_mset(offset, count) 112 return matches 118 return matches 113 119 114 def add_from_path(self, path): 120 def add_from_path(self, path): 115 '''Add contents of {path} (file itself or all text files in directory 121 '''Add contents of {path} (file itself or all text files in directory 116 if directory) to the search index.''' 122 if directory) to the search index.''' 117 path = path.strip() 123 path = path.strip() 118 if not os.path.exists(path): 124 if not os.path.exists(path): 119 print '"%s" is not an existent path' % path 125 print '"%s" is not an existent path' % path 120 return 1 126 return 1 121 if os.path.isdir(path): 127 if os.path.isdir(path): 122 fns = os.listdir(path) 128 fns = os.listdir(path) 123 fns 
= filter(lambda x: x.endswith('.txt'), fns) 129 fns = filter(lambda x: x.endswith('.txt'), fns) 124 works = [ os.path.join(path, fn) for fn in fns ] 130 works = [ os.path.join(path, fn) for fn in fns ] 125 else: 131 else: 126 works = [ path ] 132 works = [ path ] 127 for work in works: 133 for work in works: 128 if self.verbose:129 print 'Processing %s' % work130 fileobj = open(work) 134 fileobj = open(work) 131 self. index.add_item(fileobj)135 self.add_item(fileobj) 132 136 133 @classmethod 137 @classmethod 134 def print_matches(self, matches): 138 def print_matches(self, matches): 135 # Display the results. 139 # Display the results. 136 msg = '%i results found.' % matches.get_matches_estimated() 140 msg = '%i results found.' % matches.get_matches_estimated() 137 msg += 'Results 1-%i:' % matches.size() 141 msg += 'Results 1-%i:' % matches.size() 138 142 139 for m in matches: 143 for m in matches: 140 msg += '\n' 144 msg += '\n' 141 msg += '%i: %i%% docid=%i' % (m.rank + 1, m.percent, m.docid) 145 msg += '%i: %i%% docid=%i' % (m.rank + 1, m.percent, m.docid) 142 msg += '\n' 146 msg += '\n' 143 msg += m.document.get_data() 147 msg += m.document.get_data() 144 msg += '\n' 148 msg += '\n' 145 return msg 149 return msg 146 150 147 trunk/shakespeare/templates/search/index.html
Revision 181 Revision 191 1 <html xmlns:py="http://genshi.edgewall.org/" 1 <html xmlns:py="http://genshi.edgewall.org/" 2 xmlns:xi="http://www.w3.org/2001/XInclude"> 2 xmlns:xi="http://www.w3.org/2001/XInclude"> 3 3 4 <py:def function="page_title">Search Shakespeare's Work</py:def> 4 <py:def function="page_title">Search Shakespeare's Work</py:def> 5 5 6 <div py:match="content"> 6 <div py:match="content"> 7 <form name="test" method="GET" action=""> 7 <form name="test" method="GET" action=""> 8 <input type="text" name="query" /> 8 <input type="text" name="query" /> 9 <input type="submit" name="submit" value="Submit" /> 9 <input type="submit" name="submit" value="Submit" /> 10 </form> 10 </form> 11 11 12 <div class="search-results" py:if="c.matches is not None"> 12 <div class="search-results" py:if="c.matches is not None"> 13 <h3>Search Results </h3>13 <h3>Search Results For: ${c.query}</h3> 14 There were ${c.total} results. 14 There were ${c.total} results. 15 <ul> 15 <ul> 16 <li py:for="m in c.matches"> 16 <li py:for="m in c.results"> 17 ${m.document.get_data()} 17 Work: ${m.title}, Line: ${m._lineno} 18 <blockquote> 19 <pre>${m._snippet}</pre> 20 </blockquote> 18 </li> 21 </li> 19 </ul> 22 </ul> 20 </div> 23 </div> 21 </div> 24 </div> 22 25 23 <xi:include href="../layout.html" /> 26 <xi:include href="../layout.html" /> 24 </html> 27 </html> trunk/shakespeare/tests/functional/test_search.py
Revision 189 Revision 191 1 import StringIO 1 import StringIO 2 2 3 from shakespeare.tests import * 3 from shakespeare.tests import * 4 4 5 import shakespeare.search 5 import shakespeare.search 6 6 7 class TestSearchController(TestController): 7 class TestSearchController(TestController): 8 8 9 def setUp(self): 9 def setUp(self): 10 # TODO: remove this item from index in tearDown 10 # TODO: remove this item from index in tearDown 11 text = make_fixture() 11 text = make_fixture() 12 sindex = shakespeare.search.SearchIndex.default_index() 12 sindex = shakespeare.search.SearchIndex.default_index() 13 sindex.add_item(StringIO.StringIO(text.content) )13 sindex.add_item(StringIO.StringIO(text.content), text.name) 14 14 15 def test_index(self): 15 def test_index(self): 16 url = url_for(controller='search') 16 url = url_for(controller='search') 17 res = self.app.get(url) 17 res = self.app.get(url) 18 assert "Search" in res 18 assert "Search" in res 19 19 20 def test_search(self): 20 def test_search(self): 21 url = url_for(controller='search') 21 url = url_for(controller='search') 22 res = self.app.get(url) 22 res = self.app.get(url) 23 form = res.forms[0] 23 form = res.forms[0] 24 form['query'] = 'summer' 24 form['query'] = 'summer' 25 res = form.submit() 25 res = form.submit() 26 # print res 26 assert 'Search Results' in res 27 assert 'Search Results' in res 27 assert 'Shall I compare thee' in res 28 assert 'Shall I compare thee' in res 28 29 trunk/shakespeare/tests/test_search.py
Revision 189 Revision 191 1 import os 1 import os 2 import shutil 2 import shutil 3 import tempfile 3 import tempfile 4 import StringIO 4 import StringIO 5 5 6 import shakespeare.search 6 import shakespeare.search 7 import shakespeare.tests 7 8 8 class TestSearch: 9 class TestSearch: 9 # break up a little to make indexing more interesting10 text = \11 '''12 Shall I compare thee to a summer's day?13 Thou art more lovely and more temperate:14 Rough winds do shake the darling buds of May,15 And summer's lease hath all too short a date:16 17 Sometime too hot the eye of heaven shines,18 And often is his gold complexion dimm'd,19 And every fair from fair sometime declines,20 By chance, or nature's changing course untrimm'd:21 22 But thy eternal summer shall not fade,23 Nor lose possession of that fair thou ow'st,24 Nor shall death brag thou wander'st in his shade,25 When in eternal lines to time thou grow'st,26 27 So long as men can breathe, or eyes can see,28 So long lives this, and this gives life to thee.29 '''30 31 def setUp(self): 10 def setUp(self): 11 self.text = shakespeare.tests.make_fixture() 32 basetmp = tempfile.gettempdir() 12 basetmp = tempfile.gettempdir() 33 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 13 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 34 # we leave directory in existence to help with debugging 14 # we leave directory in existence to help with debugging 35 if os.path.exists(self.tmpdir): 15 if os.path.exists(self.tmpdir): 36 shutil.rmtree(self.tmpdir) 16 shutil.rmtree(self.tmpdir) 37 os.makedirs(self.tmpdir) 17 os.makedirs(self.tmpdir) 38 self.index = shakespeare.search.SearchIndex(self.tmpdir) 18 self.index = shakespeare.search.SearchIndex(self.tmpdir) 19 self.index.add_item(StringIO.StringIO(self.text.content), 20 self.text.name) 21 22 # TODO: remove the document from the index 39 23 40 def test_add_item(self): 24 def test_add_item(self): 41 self.index.add_item(StringIO.StringIO(self.text))25 assert 
self.index.get_database().get_doccount() > 0 42 26 43 def test_search(self): 27 def test_search(self): 44 self.index.add_item(StringIO.StringIO(self.text))45 out = self.index.search('summer') 28 out = self.index.search('summer') 46 assert len(out) == 2 29 assert len(out) == 2 47 mset1 = out[1] 30 mset1 = out[1] 48 # 'But thy eternal summer ... 31 # 'But thy eternal summer ... 49 exp = "But thy eternal summer shall not fade,\nNor lose possession of that fair thou ow'st," 32 exp = "But thy eternal summer shall not fade,\nNor lose possession of that fair thou ow'st," 50 assert mset1.document.get_data().startswith(exp) 33 assert mset1.document.get_data().startswith(exp) 51 out = self.index.search('rough') 34 out = self.index.search('rough') 52 assert len(out) == 1 35 assert len(out) == 1 36 37 def test_retrieve_lineno(self): 38 out = self.index.search('summer') 39 mset1 = out[1] 40 lineno = mset1.document.get_value(shakespeare.search.LINE_NO) 41 assert lineno == '9' 53 42 43 def test_retrieve_itemid(self): 44 out = self.index.search('summer') 45 mset1 = out[1] 46 name = mset1.document.get_value(shakespeare.search.ITEM_ID) 47 assert name == self.text.name 48
