changeset 436:28f19fa562df

updated config files, Implemented content index extensions with whoosh, fixed analyzer to match more words
author Marcin Kuzminski <marcin@python-works.com>
date Sat, 28 Aug 2010 14:53:32 +0200
parents 0e8ef6f17203
children 930f8182a884
files development.ini production.ini pylons_app/lib/indexers/__init__.py pylons_app/lib/indexers/daemon.py
diffstat 4 files changed, 67 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/development.ini	Fri Aug 27 22:28:50 2010 +0200
+++ b/development.ini	Sat Aug 28 14:53:32 2010 +0200
@@ -52,6 +52,26 @@
 beaker.cache.super_short_term.type=memory
 beaker.cache.super_short_term.expire=10
 
+####################################
+###       BEAKER SESSION        ####
+####################################
+## Type of storage used for the session, current types are 
+## “dbm”, “file”, “memcached”, “database”, and “memory”. 
+## The storage uses the Container API 
+##that is also used by the cache system.
+beaker.session.type = file
+
+beaker.session.key = hg-app
+beaker.session.secret = g654dcno0-9873jhgfreyu
+beaker.session.timeout = 36000
+
+##auto save the session so there is no need to call .save()
+beaker.session.auto = False
+
+##true expire at browser close
+#beaker.session.cookie_expires = 3600
+
+    
 ################################################################################
 ## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT*  ##
 ## Debug mode will enable the interactive debugging tool, allowing ANYONE to  ##
--- a/production.ini	Fri Aug 27 22:28:50 2010 +0200
+++ b/production.ini	Sat Aug 28 14:53:32 2010 +0200
@@ -51,6 +51,26 @@
 beaker.cache.short_term.expire=60
 beaker.cache.super_short_term.type=memory
 beaker.cache.super_short_term.expire=10
+
+####################################
+###       BEAKER SESSION        ####
+####################################
+## Type of storage used for the session, current types are 
+## “dbm”, “file”, “memcached”, “database”, and “memory”. 
+## The storage uses the Container API 
+##that is also used by the cache system.
+beaker.session.type = file
+
+beaker.session.key = hg-app
+beaker.session.secret = g654dcno0-9873jhgfreyu
+beaker.session.timeout = 36000
+
+##auto save the session so there is no need to call .save()
+beaker.session.auto = False
+
+##true expire at browser close
+#beaker.session.cookie_expires = 3600
+
     
 ################################################################################
 ## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT*  ##
--- a/pylons_app/lib/indexers/__init__.py	Fri Aug 27 22:28:50 2010 +0200
+++ b/pylons_app/lib/indexers/__init__.py	Sat Aug 28 14:53:32 2010 +0200
@@ -19,18 +19,23 @@
 #LOCATION WE KEEP THE INDEX
 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
 
-#EXTENSION TO SKIP READING CONTENT ON
-EXCLUDE_EXTENSIONS = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 'swf',
-                       'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll']
+#EXTENSIONS WE WANT TO INDEX CONTENT OF
+INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c', 
+                    'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h', 
+                    'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp', 
+                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3', 
+                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql', 
+                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt', 
+                    'yaws']
 
 #CUSTOM ANALYZER wordsplit + lowercase filter
-ANALYZER = RegexTokenizer() | LowercaseFilter()
+ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(owner=TEXT(),
                 repository=TEXT(stored=True),
                 path=ID(stored=True, unique=True),
                 content=TEXT(stored=True, analyzer=ANALYZER),
-                modtime=STORED())
+                modtime=STORED(),extension=TEXT(stored=True))
 
-IDX_NAME = 'HG_INDEX'
+IDX_NAME = 'HG_INDEX'
\ No newline at end of file
--- a/pylons_app/lib/indexers/daemon.py	Fri Aug 27 22:28:50 2010 +0200
+++ b/pylons_app/lib/indexers/daemon.py	Sat Aug 28 14:53:32 2010 +0200
@@ -38,7 +38,7 @@
 from pylons_app.model.hg_model import HgModel
 from whoosh.index import create_in, open_dir
 from shutil import rmtree
-from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \
+from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
 SCHEMA, IDX_NAME
 
 import logging
@@ -70,8 +70,10 @@
     def add_doc(self, writer, path, repo):
         """Adding doc to writer"""
         
-        #we don't won't to read excluded file extensions just index them
-        if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
+        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
+        #we just index the content of chosen files
+        if ext in INDEX_EXTENSIONS:
+            log.debug('    >> %s [WITH CONTENT]' % path)
             fobj = open(path, 'rb')
             content = fobj.read()
             fobj.close()
@@ -81,15 +83,20 @@
                 #incase we have a decode error just represent as byte string
                 u_content = unicode(str(content).encode('string_escape'))
         else:
-            u_content = u''    
+            log.debug('    >> %s' % path)
+            #just index file name without its content
+            u_content = u''
+                
         writer.add_document(owner=unicode(repo.contact),
                             repository=u"%s" % repo.name,
                             path=u"%s" % path,
                             content=u_content,
-                            modtime=os.path.getmtime(path)) 
+                            modtime=os.path.getmtime(path),
+                            extension=ext) 
     
     def build_index(self):
         if os.path.exists(IDX_LOCATION):
+            log.debug('removing previos index')
             rmtree(IDX_LOCATION)
             
         if not os.path.exists(IDX_LOCATION):
@@ -102,7 +109,6 @@
             log.debug('building index @ %s' % repo.path)
         
             for idx_path in self.get_paths(repo.path):
-                log.debug('    >> %s' % idx_path)
                 self.add_doc(writer, idx_path, repo)
         writer.commit(merge=True)
                 
@@ -170,11 +176,12 @@
             self.update_index()
         
 if __name__ == "__main__":
-    repo_location = '/home/marcink/python_workspace_dirty/*'
-    
+    repo_location = '/home/marcink/hg_repos/*'
+    full_index = True # False means looking just for changes
     try:
         l = DaemonLock()
-        WhooshIndexingDaemon(repo_location=repo_location).run(full_index=True)
+        WhooshIndexingDaemon(repo_location=repo_location)\
+            .run(full_index=full_index)
         l.release()
     except LockHeld:
         sys.exit(1)