Changeset 173
- Timestamp:
- 08/10/08 15:48:51 (3 months ago)
- Files:
-
- trunk/milton/__init__.py (modified) (1 diff)
- trunk/milton/__init__.pyc (added)
- trunk/milton/cache.py (modified) (1 diff)
- trunk/milton/cache.pyc (added)
- trunk/milton/cli.py (added)
- trunk/milton/cli.pyc (added)
- trunk/milton/concordance.py (modified) (1 diff)
- trunk/milton/format.py (modified) (1 diff)
- trunk/milton/gutenberg.py (modified) (1 diff)
- trunk/milton/index.py (modified) (1 diff)
- trunk/milton/search.py (added)
- trunk/milton/websetup.py (added)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/milton/__init__.py
Revision 141 Revision 173 1 __version__ = '0.1' 1 ''' 2 Introduction 3 ************ 4 5 The Open Milton package provides a full open set of shakespeare's works 6 (often in multiple versions) along with ancillary material, a variety of tools 7 and a python API. 8 9 Specifically in addition to the works themselves (often in multiple versions) 10 there is an introduction, a chronology, explanatory notes, a concordance and 11 search facilities. 12 13 All material is open source/open knowledge so that anyone can use, redistribute 14 and reuse these materials freely. For exact details of the license under which 15 this package is made available please see COPYING.txt. 16 17 Open Milton has been developed under the aegis of the Open Knowledge 18 Foundation (http://www.okfn.org/). It is a sub-project of Open Shakespeare. 19 20 Contact the Project 21 ******************* 22 23 Please mail info@okfn.org or join the okfn-discuss mailing list: 24 25 http://lists.okfn.org/listinfo/okfn-discuss 26 27 28 Installation and Setup 29 ********************** 30 31 1. Install the code 32 =================== 33 34 1.1: (EITHER) Install using setup.py (preferred) 35 ------------------------------------------------ 36 37 Install ``milton`` using easy_install:: 38 39 easy_install shakespeare 40 41 NB: If you don't have easy_install you can get it from here: 42 43 <http://peak.telecommunity.com/DevCenter/EasyInstall#installation-instructions> 44 45 46 1.2 (OR) Get the code straight from subversion 47 ------------------------------------------------ 48 49 1. Check out the subversion trunk:: 50 51 svn co https://knowledgeforge.net/shakespeare/svn/trunk 52 53 2. Do:: 54 55 sudo python setup.py develop 56 57 58 Getting Started 59 *************** 60 61 As a user: 62 ========== 63 64 1. Basic setup 65 -------------- 66 67 To access most of the main features of Open Milton you need a database. 68 For this and other bits and bobs of configuration you will need a configuration 69 file. 
70 71 You can make a config file as follows:: 72 73 paster make-config milton {your-config.ini} 74 75 Tweak the config file as appropriate and then set up the application:: 76 77 paster setup-app config.ini 78 79 [TODO: this should be part of setup-app] 80 81 Run:: 82 83 $ milton-admin db create 84 $ milton-admin db init 85 86 2. Extras 87 --------- 88 89 1. Search index. [TODO] 90 91 2. You can start a web server to provide an easy-to-use web interface to the 92 shakespeare material and facilities by doing:: 93 94 $ paster serve {your-config.ini} 95 96 NB: {your-config.ini} should be replaced with the name of the config file you 97 created earlier. 98 99 100 As a developer: 101 =============== 102 103 0. Setup 104 -------- 105 106 Follow the basic steps above but with an ini file named: development.ini 107 108 NB: you'll probably want to change log levels to debug. 109 110 1. Check out the administrative commands 111 ---------------------------------------- 112 113 $ bin/milton-admin help. 114 115 2. 
Run the tests using either py.test or nosetests:: 116 ---------------------------------------------------- 117 118 $ nosetests milton 119 ''' 120 __version__ = '0.2dev' 2 __application_name__ = 'milton' 121 __application_name__ = 'milton' 3 122 4 def conf(): 123 def conf(): 5 import os 124 import os 6 defaultPath = os.path.abspath('./ etc/%s.conf' % __application_name__)125 defaultPath = os.path.abspath('./development.ini') 7 envVarName = __application_name__.upper() + 'CONF' 126 envVarName = __application_name__.upper() + 'CONF' 8 confPath = os.environ.get(envVarName, defaultPath) 127 confPath = os.environ.get(envVarName, defaultPath) 9 if not os.path.exists(confPath): 128 if not os.path.exists(confPath): 10 raise ValueError('No Configuration file exists at: %s' % confPath) 129 raise ValueError('No Configuration file exists at: %s' % confPath) 11 import ConfigParser 130 12 conf = ConfigParser.SafeConfigParser() 131 # register the config 13 conf.read(confPath) 132 import paste.deploy 133 import milton.config.environment 134 pasteconf = paste.deploy.appconfig('config:' + confPath) 135 136 milton.config.environment.load_environment(pasteconf.global_conf, 137 pasteconf.local_conf) 138 from pylons import config 139 conf = config 140 141 # import ConfigParser 142 # conf = ConfigParser.SafeConfigParser() 143 # conf.read(confPath) 144 14 return conf 145 return conf 15 146 trunk/milton/cache.py
Revision 141 Revision 173 1 import os 1 import os 2 import urllib 2 import urllib 3 3 4 import milton 4 import milton 5 conf = milton.conf() 5 conf = milton.conf() 6 6 7 class Cache(object): 7 class Cache(object): 8 """Provide a local filesystem cache for material. 8 """Provide a local filesystem cache for material. 9 """ 9 """ 10 10 11 def __init__(self, cache_path): 11 def __init__(self, cache_path): 12 self.cache_path = cache_path 12 self.cache_path = cache_path 13 13 14 def path(self, remote_url, version=''): 14 def path(self, remote_url, version=''): 15 """Get local path to text of remote url. 15 """Get local path to text of remote url. 16 @type: string giving version of text (''|'cleaned') 16 @type: string giving version of text (''|'cleaned') 17 """ 17 """ 18 protocolEnd = remote_url.index(':') + 3 # add 3 for :// 18 protocolEnd = remote_url.index(':') + 3 # add 3 for :// 19 path = remote_url[protocolEnd:] 19 path = remote_url[protocolEnd:] 20 base, name = os.path.split(path) 20 base, name = os.path.split(path) 21 name = version + name 21 name = version + name 22 offset = os.path.join(base, name) 22 offset = os.path.join(base, name) 23 localPath = self.path_from_offset(offset) 23 localPath = self.path_from_offset(offset) 24 return localPath 24 return localPath 25 25 26 def download_url(self, url, overwrite=False): 26 def download_url(self, url, overwrite=False): 27 """Download a url to the local cache 27 """Download a url to the local cache 28 @overwrite: if True overwrite an existing local copy otherwise don't 28 @overwrite: if True overwrite an existing local copy otherwise don't 29 """ 29 """ 30 localPath = self.path(url) 30 localPath = self.path(url) 31 dirpath = os.path.dirname(localPath) 31 dirpath = os.path.dirname(localPath) 32 if overwrite or not(os.path.exists(localPath)): 32 if overwrite or not(os.path.exists(localPath)): 33 if not os.path.exists(dirpath): 33 if not os.path.exists(dirpath): 34 os.makedirs(dirpath) 34 os.makedirs(dirpath) 35 # use 
wget as it seems to work more reliably on wikimedia 35 # use wget as it seems to work more reliably on wikimedia 36 # rgrp: 2008-03-18 use urllib rather than wget despite these issues 36 # rgrp: 2008-03-18 use urllib rather than wget despite these issues 37 # as wget is fairly specific to linux/unix and even there may not 37 # as wget is fairly specific to linux/unix and even there may not 38 # be installed. 38 # be installed. 39 # cmd = 'wget -O %s %s' % (localPath, url) 39 # cmd = 'wget -O %s %s' % (localPath, url) 40 # os.system(cmd) 40 # os.system(cmd) 41 urllib.urlretrieve(url, localPath) 41 urllib.urlretrieve(url, localPath) 42 42 43 def path_from_offset(self, offset): 43 def path_from_offset(self, offset): 44 "Get full path of file in cache given by offset." 44 "Get full path of file in cache given by offset." 45 return os.path.join(self.cache_path, offset) 45 return os.path.join(self.cache_path, offset) 46 46 47 47 48 default_path = milton.conf() .get('misc', 'cachedir')48 default_path = milton.conf()['cachedir'] 49 default = Cache(default_path) 49 default = Cache(default_path) 50 50 trunk/milton/concordance.py
Revision 141 Revision 173 1 """ 1 """ 2 Concordance (and statistics) for texts in database. 2 Concordance (and statistics) for texts in database. 3 3 4 To build concordance use ConcordanceBuilder. To access concordance/statistics 4 To build concordance use ConcordanceBuilder. To access concordance/statistics 5 use Concordance/Statistics class. Concordance and statistics are provided as 5 use Concordance/Statistics class. Concordance and statistics are provided as 6 dictionaries keyed by words. 6 dictionaries keyed by words. 7 7 8 NB: all word keys have been lower-cased in order to render them 8 NB: all word keys have been lower-cased in order to render them 9 case-insensitive 9 case-insensitive 10 """ 10 """ 11 import re 11 import re 12 12 13 import sqlobject 13 import sqlobject 14 14 15 import milton.index 15 import milton.index 16 import milton.cache 16 import milton.cache 17 17 18 18 19 class ConcordanceBase(object): 19 class ConcordanceBase(object): 20 """ 20 """ 21 TODO: caching?? 21 TODO: caching?? 22 """ 22 """ 23 sqlcc = milton. dm.Concordance23 sqlcc = milton.model.Concordance 24 sqlstat = milton. dm.Statistic24 sqlstat = milton.model.Statistic 25 25 26 def __init__(self, filter_names=None): 26 def __init__(self, filter_names=None): 27 """ 27 """ 28 @param filter_names: a list of id names with which to filter results 28 @param filter_names: a list of id names with which to filter results 29 (i.e. only return results relating to those texts) 29 (i.e. 
only return results relating to those texts) 30 """ 30 """ 31 self._filter_names = filter_names 31 self._filter_names = filter_names 32 self.sqlcc_filter = self._make_filter(self.sqlcc) 32 self.sqlcc_filter = self._make_filter(self.sqlcc) 33 self.sqlstat_filter = self._make_filter(self.sqlstat) 33 self.sqlstat_filter = self._make_filter(self.sqlstat) 34 34 35 def _make_filter(self, sqlobj): 35 def _make_filter(self, sqlobj): 36 sql_filter = True 36 sql_filter = True 37 if self._filter_names is not None: 37 if self._filter_names is not None: 38 arglist = [] 38 arglist = [] 39 for name in self._filter_names: 39 for name in self._filter_names: 40 newarg = sqlobj.q.textID == self._name2id(name) 40 newarg = sqlobj.q.textID == self._name2id(name) 41 arglist.append(newarg) 41 arglist.append(newarg) 42 sql_filter = sqlobject.OR(*arglist) 42 sql_filter = sqlobject.OR(*arglist) 43 return sql_filter 43 return sql_filter 44 44 45 def _name2id(self, name): 45 def _name2id(self, name): 46 return milton. 
dm.Material.byName(name).id46 return milton.model.Material.byName(name).id 47 47 48 def keys(self): 48 def keys(self): 49 """Return list of *distinct* words in concordance/statistics 49 """Return list of *distinct* words in concordance/statistics 50 """ 50 """ 51 all = self.sqlstat.select(self.sqlstat_filter, 51 all = self.sqlstat.select(self.sqlstat_filter, 52 orderBy=self.sqlstat.q.word, 52 orderBy=self.sqlstat.q.word, 53 ) 53 ) 54 words = [ xx.word for xx in list(all) ] 54 words = [ xx.word for xx in list(all) ] 55 distinct = list(set(words)) 55 distinct = list(set(words)) 56 distinct.sort() 56 distinct.sort() 57 return distinct 57 return distinct 58 58 59 59 60 class Concordance(ConcordanceBase): 60 class Concordance(ConcordanceBase): 61 """Concordance by word for a set of texts 61 """Concordance by word for a set of texts 62 """ 62 """ 63 63 64 def get(self, word): 64 def get(self, word): 65 """Get list of occurrences for word 65 """Get list of occurrences for word 66 @return: sqlobject query list 66 @return: sqlobject query list 67 """ 67 """ 68 select = self.sqlcc.select(sqlobject.AND(self.sqlcc_filter, self.sqlcc.q.word==word)) 68 select = self.sqlcc.select(sqlobject.AND(self.sqlcc_filter, self.sqlcc.q.word==word)) 69 return select 69 return select 70 70 71 class Statistics(ConcordanceBase): 71 class Statistics(ConcordanceBase): 72 72 73 def get(self, word): 73 def get(self, word): 74 select = self.sqlstat.select( 74 select = self.sqlstat.select( 75 sqlobject.AND(self.sqlstat_filter, self.sqlstat.q.word==word) 75 sqlobject.AND(self.sqlstat_filter, self.sqlstat.q.word==word) 76 ) 76 ) 77 total = 0 77 total = 0 78 for stat in select: 78 for stat in select: 79 total += stat.occurrences 79 total += stat.occurrences 80 return total 80 return total 81 81 82 class ConcordanceBuilder(object): 82 class ConcordanceBuilder(object): 83 """Build a concordance and associated statistics for a set of texts. 
83 """Build a concordance and associated statistics for a set of texts. 84 84 85 """ 85 """ 86 86 87 # multiline, unicode and ignorecase 87 # multiline, unicode and ignorecase 88 word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 88 word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 89 89 90 words_to_ignore = [ 90 words_to_ignore = [ 91 # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'in' 91 # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'in' 92 ] 92 ] 93 non_words = [ 93 non_words = [ 94 'd', # accus'd 94 'd', # accus'd 95 't', 95 't', 96 ] 96 ] 97 97 98 def is_roman_numeral(self, word): 98 def is_roman_numeral(self, word): 99 digits = [ 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix' ] 99 digits = [ 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix' ] 100 others = [ 'l', 'x', 'c' ] 100 others = [ 'l', 'x', 'c' ] 101 if word == 'i': return False # exception because this conflicts with I 101 if word == 'i': return False # exception because this conflicts with I 102 while word[0] in others: 102 while word[0] in others: 103 if len(word) == 1: 103 if len(word) == 1: 104 return True 104 return True 105 else: 105 else: 106 word = word[1:] 106 word = word[1:] 107 return word in digits 107 return word in digits 108 108 109 def ignore_word(self, word): 109 def ignore_word(self, word): 110 "Return True if this word should not be added to the concordance." 110 "Return True if this word should not be added to the concordance." 111 bool1 = word in self.words_to_ignore 111 bool1 = word in self.words_to_ignore 112 bool2 = word in self.non_words 112 bool2 = word in self.non_words 113 # do roman numerals 113 # do roman numerals 114 bool3 = self.is_roman_numeral(word) 114 bool3 = self.is_roman_numeral(word) 115 return bool1 or bool2 or bool3 115 return bool1 or bool2 or bool3 116 116 117 def _text_already_done(self, text): 117 def _text_already_done(self, text): 118 numrecs = milton. 
dm.Concordance.select(118 numrecs = milton.model.Concordance.select( 119 milton. dm.Concordance.q.textID==text.id119 milton.model.Concordance.q.textID==text.id 120 ).count() 120 ).count() 121 return numrecs > 0 121 return numrecs > 0 122 122 123 def add_text(self, name, text=None): 123 def add_text(self, name, text=None): 124 """Add a text to the concordance. 124 """Add a text to the concordance. 125 @param name: name of text to add 125 @param name: name of text to add 126 @param text: [optional] a file-like object containing text data. If not 126 @param text: [optional] a file-like object containing text data. If not 127 provided will default to using file in cache associated with named 127 provided will default to using file in cache associated with named 128 text 128 text 129 """ 129 """ 130 dmText = milton. dm.Material.byName(name)130 dmText = milton.model.Material.byName(name) 131 if self._text_already_done(dmText): 131 if self._text_already_done(dmText): 132 msg = 'Have already added to concordance text: %s' % dmText 132 msg = 'Have already added to concordance text: %s' % dmText 133 # raise ValueError(msg) 133 # raise ValueError(msg) 134 print msg 134 print msg 135 print 'Skipping' 135 print 'Skipping' 136 return 136 return 137 if text is None: 137 if text is None: 138 tpath = dmText.get_cache_path('plain') 138 tpath = dmText.get_cache_path('plain') 139 text = file(tpath) 139 text = file(tpath) 140 lineCount = 0 140 lineCount = 0 141 charIndex = 0 141 charIndex = 0 142 stats = {} 142 stats = {} 143 trans = milton. 
dm.Concordance._connection.transaction()143 trans = milton.model.Concordance._connection.transaction() 144 for line in text.readlines(): 144 for line in text.readlines(): 145 for match in self.word_regex.finditer(line): 145 for match in self.word_regex.finditer(line): 146 word = match.group().lower() # case insensitive 146 word = match.group().lower() # case insensitive 147 if self.ignore_word(word): 147 if self.ignore_word(word): 148 continue 148 continue 149 milton. dm.Concordance(connection=trans,149 milton.model.Concordance(connection=trans, 150 text=dmText, 150 text=dmText, 151 word=word, 151 word=word, 152 line=lineCount, 152 line=lineCount, 153 char_index=charIndex+match.start()) 153 char_index=charIndex+match.start()) 154 stats[word] = stats.get(word, 0) + 1 154 stats[word] = stats.get(word, 0) + 1 155 lineCount += 1 155 lineCount += 1 156 charIndex += len(line) 156 charIndex += len(line) 157 trans.commit() 157 trans.commit() 158 trans = milton. dm.Concordance._connection.transaction()158 trans = milton.model.Concordance._connection.transaction() 159 for word, value in stats.items(): 159 for word, value in stats.items(): 160 tresults = milton. dm.Statistic.select(160 tresults = milton.model.Statistic.select( 161 sqlobject.AND( 161 sqlobject.AND( 162 milton. dm.Statistic.q.textID == dmText.id,162 milton.model.Statistic.q.textID == dmText.id, 163 milton. dm.Statistic.q.word == word163 milton.model.Statistic.q.word == word 164 )) 164 )) 165 try: 165 try: 166 dbstat = list(tresults)[0] 166 dbstat = list(tresults)[0] 167 dbstat.occurrences += value 167 dbstat.occurrences += value 168 except: 168 except: 169 milton. 
dm.Statistic(169 milton.model.Statistic( 170 connection=trans, 170 connection=trans, 171 text=dmText, 171 text=dmText, 172 word=word, 172 word=word, 173 occurrences=value 173 occurrences=value 174 ) 174 ) 175 trans.commit() 175 trans.commit() 176 176 177 177 178 def remove_text(self, name): 178 def remove_text(self, name): 179 """Remove a text from the concordance. 179 """Remove a text from the concordance. 180 180 181 @param name: as for add_text 181 @param name: as for add_text 182 """ 182 """ 183 dmText = milton. dm.Material.byName(name)183 dmText = milton.model.Material.byName(name) 184 recs = milton. dm.Concordance.select(184 recs = milton.model.Concordance.select( 185 milton. dm.Concordance.q.textID==dmText.id185 milton.model.Concordance.q.textID==dmText.id 186 ) 186 ) 187 for rec in recs: 187 for rec in recs: 188 milton. dm.Concordance.delete(rec.id)188 milton.model.Concordance.delete(rec.id) 189 stats = milton. dm.Statistic.select(189 stats = milton.model.Statistic.select( 190 milton. dm.Statistic.q.textID==dmText.id190 milton.model.Statistic.q.textID==dmText.id 191 ) 191 ) 192 for stat in stats: 192 for stat in stats: 193 milton. dm.Statistic.delete(stat.id)193 milton.model.Statistic.delete(stat.id) 194 194 trunk/milton/format.py
Revision 141 Revision 173 1 """ 1 """ 2 Format texts in a variety of ways 2 Format texts in a variety of ways 3 """ 3 """ 4 4 5 def format_text(fileobj, format): 5 def format_text(fileobj, format): 6 """Format a provided text in a variety of ways. 6 """Format a provided text in a variety of ways. 7 7 8 @format: the name specifying the format to use 8 @format: the name specifying the format to use 9 """ 9 """ 10 formatter = None 10 formatter = None 11 if format == 'plain': 11 if format == 'plain': 12 formatter = TextFormatterPlain() 12 formatter = TextFormatterPlain() 13 elif format == 'lineno': 13 elif format == 'lineno': 14 formatter = TextFormatterLineno() 14 formatter = TextFormatterLineno() 15 elif format == 'annotate': 15 elif format == 'annotate': 16 formatter = TextFormatterAnnotate() 16 formatter = TextFormatterAnnotate() 17 else: 17 else: 18 raise ValueError('Unknown format: %s' % format) 18 raise ValueError('Unknown format: %s' % format) 19 return formatter.format(fileobj) 19 return formatter.format(fileobj) 20 20 21 21 22 class TextFormatter(object): 22 class TextFormatter(object): 23 """Abstract base class for formatters. 23 """Abstract base class for formatters. 24 """ 24 """ 25 25 26 def format(self, file): 26 def format(self, file): 27 """Format the supplied text. 27 """Format the supplied text. 
28 28 29 @file: file-like object containing a text in plain txt with utf-8 29 @file: file-like object containing a text in plain txt with utf-8 30 encoding 30 encoding 31 31 32 @return a string in unicode format with utf-8 encoding 32 @return a string in unicode format with utf-8 encoding 33 """ 33 """ 34 raise NotImplementedError() 34 raise NotImplementedError() 35 35 36 def escape_chars(self, text): 36 def escape_chars(self, text): 37 return text.replace('&', '&').replace('<', '<') 37 return text.replace('&', '&').replace('<', '<') 38 38 39 class TextFormatterPlain(TextFormatter): 39 class TextFormatterPlain(TextFormatter): 40 """Format the text as plain text (in an html <pre> tag). 40 """Format the text as plain text (in an html <pre> tag). 41 """ 41 """ 42 42 43 def format(self, file): 43 def format(self, file): 44 self.file = file 44 self.file = file 45 out = unicode(self.file.read(), 'utf-8') 45 out = unicode(self.file.read(), 'utf-8') 46 out = self.escape_chars(out) 46 out = self.escape_chars(out) 47 out = \ 47 out = \ 48 u''' 48 u''' 49 <pre> 49 <pre> 50 %s 50 %s 51 </pre>''' % out 51 </pre>''' % out 52 return out 52 return out 53 53 54 class TextFormatterLineno(TextFormatter): 54 class TextFormatterLineno(TextFormatter): 55 """Format the text to have line numbers. 55 """Format the text to have line numbers. 
56 """ 56 """ 57 57 58 def format(self, file): 58 def format(self, file): 59 self.file = file 59 self.file = file 60 result = '' 60 result = '' 61 count = 0 61 count = 0 62 for line in self.file.readlines(): 62 for line in self.file.readlines(): 63 #line count updated from 464 tlineno = unicode(count).ljust(5) # assume line no < 100000 63 tlineno = unicode(count).ljust(5) # assume line no < 100000 65 tline = unicode(line, 'utf-8').rstrip() 64 tline = unicode(line, 'utf-8').rstrip() 66 tline = self.escape_chars(tline) 65 tline = self.escape_chars(tline) 67 result += '<pre id="%s">%s %s</pre>' % (count, tlineno, tline) 66 result += '<pre id="%s">%s %s</pre>' % (count, tlineno, tline) 68 count += 1 67 count += 1 69 return result 68 return result 70 69 71 70 72 import annotater.marginalia 71 import annotater.marginalia 73 class TextFormatterAnnotate(TextFormatter): 72 class TextFormatterAnnotate(TextFormatter): 74 """Format the text in a manner suitable for marginalia annotation. 73 """Format the text in a manner suitable for marginalia annotation. 75 """ 74 """ 76 75 77 def format(self, file, **kwargs): 76 def format(self, file, **kwargs): 78 self.file = file 77 self.file = file 79 # todo chunking 78 # todo chunking 80 line_numberer = TextFormatterLineno() 79 line_numberer = TextFormatterLineno() 81 text_with_linenos = line_numberer.format(self.file) 80 text_with_linenos = line_numberer.format(self.file) 82 values = { 81 values = { 83 'content' : text_with_linenos, 82 'content' : text_with_linenos, 84 'id' : 'm0', 83 'id' : 'm0', 85 } 84 } 86 for key in kwargs: 85 for key in kwargs: 87 values[key] = kwargs[key] 86 values[key] = kwargs[key] 88 result = annotater.marginalia.format_entry(**values) 87 result = annotater.marginalia.format_entry(**values) 89 return result 88 return result 90 89 trunk/milton/gutenberg.py
Revision 141 Revision 173 1 """Various useful functionality related to Project Gutenberg 1 """Various useful functionality related to Project Gutenberg 2 """ 2 """ 3 import os 3 import os 4 import StringIO 4 import StringIO 5 import milton.cache 5 import milton.cache 6 6 7 7 8 class GutenbergIndex(object): 8 class GutenbergIndex(object): 9 """Parse the index of Gutenberg works so as to find Milton works. 9 """Parse the index of Gutenberg works so as to find Milton works. 10 10 11 11 12 """ 12 """ 13 13 14 # url for the Gutenberg index file 14 # url for the Gutenberg index file 15 gutindex = 'http://www.gutenberg.org/dirs/GUTINDEX.ALL' 15 gutindex = 'http://www.gutenberg.org/dirs/GUTINDEX.ALL' 16 16 17 def __init__(self): 17 def __init__(self): 18 self.download_gutenberg_index() 18 self.download_gutenberg_index() 19 self._gutindex_local_path = milton.cache.default.path(self.gutindex) 19 self._gutindex_local_path = milton.cache.default.path(self.gutindex) 20 20 21 def download_gutenberg_index(self): 21 def download_gutenberg_index(self): 22 """Download the Gutenberg Index file GUTINDEX.ALL to cache if we don't 22 """Download the Gutenberg Index file GUTINDEX.ALL to cache if we don't 23 have it already. 23 have it already. 24 """ 24 """ 25 milton.cache.default.download_url(self.gutindex) 25 milton.cache.default.download_url(self.gutindex) 26 26 27 def make_url(self, year, idStr): 27 def make_url(self, year, idStr): 28 return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr) 28 return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr) 29 29 30 def get_milton_list(self): 30 def get_milton_list(self): 31 """Get list of milton works and urls. 31 """Get list of milton works and urls. 32 32 33 Results are sorted by work title. 33 Results are sorted by work title. 
34 34 35 Notes regarding list of plays:36 37 * no Folio edition of Troilus and Cressida38 * no Folio edition of Pericles39 """ 35 """ 40 # results have format [ title, url, comments ] 36 # results have format [ title, url, comments ] 41 # folio in comments indicates it is a first folio42 results = [ ["Areopagitica", 'http://www.gutenberg.org/files/608/608.txt', ''] ] 37 results = [ ["Areopagitica", 'http://www.gutenberg.org/files/608/608.txt', ''] ] 43 results.append(["L'Allegro, Il Penseroso, Comus, and Lycidas", 38 results.append(["L'Allegro, Il Penseroso, Comus, and Lycidas", 44 'http://www.gutenberg.org/dirs/etext96/miltp10.txt', ''] 39 'http://www.gutenberg.org/dirs/etext96/miltp10.txt', ''] 45 ) 40 ) 46 results.append(["Comus", 41 results.append(["Comus", 47 'http://www.gutenberg.org/files/19819/19819.txt', ''] 42 'http://www.gutenberg.org/files/19819/19819.txt', ''] 48 ) 43 ) 49 results.append(["Paradise Lost (No introduction)", 44 results.append(["Paradise Lost (No introduction)", 50 'http://www.gutenberg.org/dirs/etext91/plboss10.txt', ''] 45 'http://www.gutenberg.org/dirs/etext91/plboss10.txt', ''] 51 ) 46 ) 52 47 53 results.append(["Paradise Regained", 48 results.append(["Paradise Regained", 54 'http://www.gutenberg.org/dirs/etext93/rgain10.txt', ''] 49 'http://www.gutenberg.org/dirs/etext93/rgain10.txt', ''] 55 ) 50 ) 56 51 57 results.append(["Poemata", 52 results.append(["Poemata", 58 'http://www.gutenberg.org/dirs/etext04/poema10.txt', ''] 53 'http://www.gutenberg.org/dirs/etext04/poema10.txt', ''] 59 ) 54 ) 60 55 61 results.append(["Poetical Works", 56 results.append(["Poetical Works", 62 'http://www.gutenberg.org/dirs/etext99/pmsjm10.txt', ''] 57 'http://www.gutenberg.org/dirs/etext99/pmsjm10.txt', ''] 63 ) 58 ) 64 def compare_list(item1, item2): 59 def compare_list(item1, item2): 65 if item1[0] > item2[0]: return 1 60 if item1[0] > item2[0]: return 1 66 else: return -1 61 else: return -1 67 results.sort(compare_list) 62 results.sort(compare_list) 
68 return results 63 return results 69 64 70 """ 65 """ 71 Clean up Gutenberg texts by removing all the header and footer bumpf 66 Clean up Gutenberg texts by removing all the header and footer bumpf 72 """ 67 """ 73 68 74 import re 69 import re 75 70 76 headerEndPhrases = ['START OF THIS PROJECT', 'START OF THE PROJECT', 'THE SMALL PRINT! FOR PUBLIC DOMAIN'] 71 headerEndPhrases = ['START OF THIS PROJECT', 'START OF THE PROJECT', 'THE SMALL PRINT! FOR PUBLIC DOMAIN'] 77 notesStartPhrases = ["Executive Director's Notes:"] 72 notesStartPhrases = ["Executive Director's Notes:"] 78 notesEndPhrases = ['Produced by'] 73 notesEndPhrases = ['Produced by'] 79 footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg', 'END OF THE PROJECT GUTENBERG EBOOK'] 74 footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg', 'END OF THE PROJECT GUTENBERG EBOOK'] 80 75 81 def make_re_from_phrase(phrase): 76 def make_re_from_phrase(phrase): 82 """ 77 """ 83 Make a regular expression that matches a phrase and its surrounding 78 Make a regular expression that matches a phrase and its surrounding 84 paragraph, i.e. that look like: 79 paragraph, i.e. that look like: 85 80 86 ... phrase .... 81 ... phrase .... 
87 more text 82 more text 88 [blank] 83 [blank] 89 [blank]+ 84 [blank]+ 90 """ 85 """ 91 # need \S to ensure not just whitespace 86 # need \S to ensure not just whitespace 92 paragraphText = '(.+\S.+\n)*' 87 paragraphText = '(.+\S.+\n)*' 93 # [[TODO: check slowdown due to inclusion of '^.*' at start 88 # [[TODO: check slowdown due to inclusion of '^.*' at start 94 tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 89 tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 95 return re.compile(tmp, re.I | re.M) # make it case insensitive 90 return re.compile(tmp, re.I | re.M) # make it case insensitive 96 91 97 class Gutenbergmilton(object): 92 class Gutenbergmilton(object): 98 """ 93 """ 99 Process Gutenberg Milton texts 94 Process Gutenberg Milton texts 100 """ 95 """ 101 96 102 def __init__(self, etext): 97 def __init__(self, etext): 103 """ 98 """ 104 @param etext: file like object containing the etext 99 @param etext: file like object containing the etext 105 100 106 Procedure: 101 Procedure: 107 1. strip out header and footer bumpf 102 1. strip out header and footer bumpf 108 2. are there notes? If so strip them out 103 2. are there notes? 
If so strip them out 109 """ 104 """ 110 self.etext = etext 105 self.etext = etext 111 # most milton texts are either ascii or latin-1 106 # most milton texts are either ascii or latin-1 112 self.etextStr = unicode(self.etext.read(), 'latin-1').encode('utf-8') 107 self.etextStr = unicode(self.etext.read(), 'latin-1').encode('utf-8') 113 # normalize the line endings to save us grief later 108 # normalize the line endings to save us grief later 114 self.etextStr = self.etextStr.replace('\r\n', '\n') 109 self.etextStr = self.etextStr.replace('\r\n', '\n') 115 self.hasNotes = False 110 self.hasNotes = False 116 111 117 def _find_max(self, phrase, string): 112 def _find_max(self, phrase, string): 118 maxIndex = 0 113 maxIndex = 0 119 regex = make_re_from_phrase(phrase) 114 regex = make_re_from_phrase(phrase) 120 matches = regex.finditer(string) 115 matches = regex.finditer(string) 121 for match in matches: 116 for match in matches: 122 maxIndex = max(match.end(), maxIndex) 117 maxIndex = max(match.end(), maxIndex) 123 return maxIndex 118 return maxIndex 124 119 125 def _find_min(self, phrase, string): 120 def _find_min(self, phrase, string): 126 minIndex = len(string) 121 minIndex = len(string) 127 regex = make_re_from_phrase(phrase) 122 regex = make_re_from_phrase(phrase) 128 matches = regex.finditer(string) 123 matches = regex.finditer(string) 129 for match in matches: 124 for match in matches: 130 minIndex = min(match.start(), minIndex) 125 minIndex = min(match.start(), minIndex) 131 return minIndex 126 return minIndex 132 127 133 def extract_text(self): 128 def extract_text(self): 134 """Extract the core text. 129 """Extract the core text. 
135 """ 130 """ 136 self.notesEnd = self.get_notes_end() 131 self.notesEnd = self.get_notes_end() 137 self.footerStart = self.get_footer_start() 132 self.footerStart = self.get_footer_start() 138 self.headerEnd = self.get_header_end() 133 self.headerEnd = self.get_header_end() 139 startIndex = self.headerEnd 134 startIndex = self.headerEnd 140 if self.notesEnd > 0: 135 if self.notesEnd > 0: 141 startIndex = self.notesEnd 136 startIndex = self.notesEnd 142 return self.etextStr[startIndex : self.footerStart].rstrip() 137 return self.etextStr[startIndex : self.footerStart].rstrip() 143 138 144 def get_notes_end(self): 139 def get_notes_end(self): 145 "Return 0 if no notes" 140 "Return 0 if no notes" 146 indices = [ self._find_min(phrase, self.etextStr) for phrase in notesEndPhrases]
