# HG changeset patch # User Marcin Kuzminski # Date 1283000012 -7200 # Node ID 28f19fa562dfe57dc6e2d655a76b05b38c45f363 # Parent 0e8ef6f17203aa6fad7be5049efb9519f9b765f7 updated config files, Implemented content index extensions with whoosh, fixed analyzer to match more words diff -r 0e8ef6f17203 -r 28f19fa562df development.ini --- a/development.ini Fri Aug 27 22:28:50 2010 +0200 +++ b/development.ini Sat Aug 28 14:53:32 2010 +0200 @@ -52,6 +52,26 @@ beaker.cache.super_short_term.type=memory beaker.cache.super_short_term.expire=10 +#################################### +### BEAKER SESSION #### +#################################### +## Type of storage used for the session, current types are +## “dbm”, “file”, “memcached”, “database”, and “memory”. +## The storage uses the Container API +##that is also used by the cache system. +beaker.session.type = file + +beaker.session.key = hg-app +beaker.session.secret = g654dcno0-9873jhgfreyu +beaker.session.timeout = 36000 + +##auto save the session to not to use .save() +beaker.session.auto = False + +##true exire at browser close +#beaker.session.cookie_expires = 3600 + + ################################################################################ ## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT* ## ## Debug mode will enable the interactive debugging tool, allowing ANYONE to ## diff -r 0e8ef6f17203 -r 28f19fa562df production.ini --- a/production.ini Fri Aug 27 22:28:50 2010 +0200 +++ b/production.ini Sat Aug 28 14:53:32 2010 +0200 @@ -51,6 +51,26 @@ beaker.cache.short_term.expire=60 beaker.cache.super_short_term.type=memory beaker.cache.super_short_term.expire=10 + +#################################### +### BEAKER SESSION #### +#################################### +## Type of storage used for the session, current types are +## “dbm”, “file”, “memcached”, “database”, and “memory”. +## The storage uses the Container API +##that is also used by the cache system. +beaker.session.type = file + +beaker.session.key = hg-app +beaker.session.secret = g654dcno0-9873jhgfreyu +beaker.session.timeout = 36000 + +##auto save the session to not to use .save() +beaker.session.auto = False + +##true exire at browser close +#beaker.session.cookie_expires = 3600 + ################################################################################ ## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT* ## diff -r 0e8ef6f17203 -r 28f19fa562df pylons_app/lib/indexers/__init__.py --- a/pylons_app/lib/indexers/__init__.py Fri Aug 27 22:28:50 2010 +0200 +++ b/pylons_app/lib/indexers/__init__.py Sat Aug 28 14:53:32 2010 +0200 @@ -19,18 +19,23 @@ #LOCATION WE KEEP THE INDEX IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index') -#EXTENSION TO SKIP READING CONTENT ON -EXCLUDE_EXTENSIONS = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 'swf', - 'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll'] +#EXTENSIONS WE WANT TO INDEX CONTENT OFF +INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c', + 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h', + 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp', + 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3', + 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql', + 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt', + 'yaws'] #CUSTOM ANALYZER wordsplit + lowercase filter -ANALYZER = RegexTokenizer() | LowercaseFilter() +ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() #INDEX SCHEMA DEFINITION SCHEMA = Schema(owner=TEXT(), repository=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT(stored=True, analyzer=ANALYZER), - modtime=STORED()) + modtime=STORED(),extension=TEXT(stored=True)) -IDX_NAME = 'HG_INDEX' +IDX_NAME = 'HG_INDEX' \ No newline at end of file diff -r 0e8ef6f17203 -r 28f19fa562df pylons_app/lib/indexers/daemon.py --- a/pylons_app/lib/indexers/daemon.py Fri Aug 27 22:28:50 2010 +0200 +++ b/pylons_app/lib/indexers/daemon.py Sat Aug 28 14:53:32 2010 +0200 @@ -38,7 +38,7 @@ from pylons_app.model.hg_model import HgModel from whoosh.index import create_in, open_dir from shutil import rmtree -from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \ +from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \ SCHEMA, IDX_NAME import logging @@ -70,8 +70,10 @@ def add_doc(self, writer, path, repo): """Adding doc to writer""" - #we don't won't to read excluded file extensions just index them - if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS: + ext = unicode(path.split('/')[-1].split('.')[-1].lower()) + #we just index the content of choosen files + if ext in INDEX_EXTENSIONS: + log.debug(' >> %s [WITH CONTENT]' % path) fobj = open(path, 'rb') content = fobj.read() fobj.close() @@ -81,15 +83,20 @@ #incase we have a decode error just represent as byte string u_content = unicode(str(content).encode('string_escape')) else: - u_content = u'' + log.debug(' >> %s' % path) + #just index file name without it's content + u_content = u'' + writer.add_document(owner=unicode(repo.contact), repository=u"%s" % repo.name, path=u"%s" % path, content=u_content, - modtime=os.path.getmtime(path)) + modtime=os.path.getmtime(path), + extension=ext) def build_index(self): if os.path.exists(IDX_LOCATION): + log.debug('removing previos index') rmtree(IDX_LOCATION) if not os.path.exists(IDX_LOCATION): @@ -102,7 +109,6 @@ log.debug('building index @ %s' % repo.path) for idx_path in self.get_paths(repo.path): - log.debug(' >> %s' % idx_path) self.add_doc(writer, idx_path, repo) writer.commit(merge=True) @@ -170,11 +176,12 @@ self.update_index() if __name__ == "__main__": - repo_location = '/home/marcink/python_workspace_dirty/*' - + repo_location = '/home/marcink/hg_repos/*' + full_index = True # False means looking just for changes try: l = DaemonLock() - WhooshIndexingDaemon(repo_location=repo_location).run(full_index=True) + WhooshIndexingDaemon(repo_location=repo_location)\ + .run(full_index=full_index) l.release() except LockHeld: sys.exit(1)