changeset 406:b153a51b1d3b

Implemented search using whoosh. Still as experimental option.
author Marcin Kuzminski <>
date Tue, 17 Aug 2010 23:15:36 +0200
parents bec06654d67b
children 0c9dfae57107
files pylons_app/config/ pylons_app/controllers/ pylons_app/controllers/ pylons_app/lib/indexers/ pylons_app/lib/indexers/ pylons_app/lib/indexers/ pylons_app/lib/indexers/ pylons_app/templates/search/search.html pylons_app/tests/functional/
diffstat 10 files changed, 714 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/pylons_app/config/	Tue Aug 17 22:29:17 2010 +0200
+++ b/pylons_app/config/	Tue Aug 17 23:15:36 2010 +0200
@@ -108,6 +108,8 @@
         m.connect('admin_home', '', action='index')#main page
         m.connect('admin_add_repo', '/add_repo/{new_repo:[a-z0-9\. _-]*}',
+    #SEARCH
+    map.connect('search', '/_admin/search', controller='search')
     map.connect('login_home', '/_admin/login', controller='login')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylons_app/controllers/	Tue Aug 17 23:15:36 2010 +0200
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# encoding: utf-8
+# search controller for pylons
+# Copyright (C) 2009-2010 Marcin Kuzminski <>
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) any later version of the license.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+"""
+Created on Aug 7, 2010
+search controller for pylons
+@author: marcink
+"""
+from pylons import request, response, session, tmpl_context as c, url
+from pylons.controllers.util import abort, redirect
+from pylons_app.lib.auth import LoginRequired
+from pylons_app.lib.base import BaseController, render
+from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA
+from webhelpers.html.builder import escape
+from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
+    ContextFragmenter
+from whoosh.index import open_dir, EmptyIndexError
+from whoosh.qparser import QueryParser, QueryParserError
+from whoosh.query import Phrase
+import logging
+import traceback
+log = logging.getLogger(__name__)
+class SearchController(BaseController):
+    """Search page controller: queries the whoosh index and renders hits."""
+    @LoginRequired()
+    def __before__(self):
+        # Only chains to the base class; the LoginRequired decorator is
+        # presumably what restricts access -- confirm in lib.auth.
+        super(SearchController, self).__before__()    
+    def index(self):
+        """Run the query given in the `q` GET parameter and render the page.
+        Sets `c.cur_query` (the raw query), `c.runtime` (a summary or an
+        error message) and `c.formated_results` (one dict per hit with the
+        hit's fields plus `content_short` and `f_path`), then renders
+        '/search/search.html'.
+        """
+        c.formated_results = []
+        c.runtime = ''
+        # terms collected from the parsed query, used for highlighting
+        search_items = set()
+        c.cur_query = request.GET.get('q', None)
+        if c.cur_query:
+            # lowercased copy used for parsing; the template keeps the
+            # query as typed
+            cur_query = c.cur_query.lower()
+        # same condition as above; the two branches could be merged
+        if c.cur_query:
+            try:
+                idx = open_dir(IDX_LOCATION, indexname='HG_INDEX')
+                searcher = idx.searcher()
+                qp = QueryParser("content", schema=SCHEMA)
+                try:
+                    query = qp.parse(unicode(cur_query))
+                    # a Phrase exposes its words directly; for any other
+                    # query the second item of each all_terms() entry is kept
+                    if isinstance(query, Phrase):
+                        search_items.update(query.words)
+                    else:
+                        for i in query.all_terms():
+                            search_items.add(i[1])
+                    log.debug(query)
+                    log.debug(search_items)
+                    # NOTE(review): the right-hand side of this assignment is
+                    # missing in this copy of the changeset -- presumably a
+                    # search of `query` on `searcher`; restore it from the
+                    # repository before applying.
+                    results =
+                    c.runtime = '%s results (%.3f seconds)' \
+                    % (len(results), results.runtime)
+                    analyzer = ANALYZER
+                    formatter = HtmlFormatter('span',
+                        between='\n<span class="break">...</span>\n') 
+                    #how the parts are splitted within the same text part
+                    fragmenter = SimpleFragmenter(200)
+                    #fragmenter = ContextFragmenter(search_items)
+                    for res in results:
+                        d = {}
+                        d.update(res)
+                        # content is HTML-escaped before highlighting, so the
+                        # only markup in `hl` comes from the formatter
+                        hl = highlight(escape(res['content']), search_items,
+                                                         analyzer=analyzer,
+                                                         fragmenter=fragmenter,
+                                                         formatter=formatter,
+                                                         top=5)
+                        # part of the stored path after the first occurrence
+                        # of the repository name, without leading slashes
+                        f_path = res['path'][res['path'].find(res['repository']) \
+                                             + len(res['repository']):].lstrip('/')
+                        d.update({'content_short':hl,
+                                  'f_path':f_path})
+                        #del d['content']
+                        c.formated_results.append(d)
+                except QueryParserError:
+                    c.runtime = 'Invalid search query. Try quoting it.'
+            except (EmptyIndexError, IOError):
+                # a missing or empty index is reported to the user instead
+                # of failing the request
+                log.error(traceback.format_exc())
+                log.error('Empty Index data')
+                c.runtime = 'There is no index to search in. Please run whoosh indexer'
+        # Return a rendered template
+        return render('/search/search.html')
--- a/pylons_app/controllers/	Tue Aug 17 22:29:17 2010 +0200
+++ b/pylons_app/controllers/	Tue Aug 17 23:15:36 2010 +0200
@@ -113,7 +113,7 @@
                           % (author.decode('utf8'),
                         [[x, aggregate[author][x]] for x in aggregate[author]]))
         if d == '':
-            d = '"%s":{label:"%s",data:[[0,0],]}' \
+            d = '"%s":{label:"%s",data:[[0,1],]}' \
                 % (author_key_cleaner(,
         return d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylons_app/lib/indexers/	Tue Aug 17 23:15:36 2010 +0200
@@ -0,0 +1,36 @@
+import sys
+import os
+from pidlock import LockHeld, DaemonLock
+import traceback
+from os.path import dirname as dn
+from os.path import join as jn
+#to get the pylons_app import
+from pylons_app.config.environment import load_environment
+from pylons_app.model.hg_model import HgModel
+from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
+from whoosh.fields import TEXT, ID, STORED, Schema
+from whoosh.index import create_in, open_dir
+from shutil import rmtree
+IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
+EXCLUDE_EXTENSIONS = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 'swf',
+                       'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll']
+#CUSTOM ANALYZER wordsplit + lowercase filter
+ANALYZER = RegexTokenizer() | LowercaseFilter()
+SCHEMA = Schema(owner=TEXT(),
+                repository=TEXT(stored=True),
+                path=ID(stored=True, unique=True),
+                content=TEXT(stored=True, analyzer=ANALYZER),
+                modtime=STORED())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylons_app/lib/indexers/	Tue Aug 17 23:15:36 2010 +0200
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+# encoding: utf-8
+# whoosh indexer daemon for hg-app
+# Copyright (C) 2009-2010 Marcin Kuzminski <>
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) any later version of the license.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+"""
+Created on Jan 26, 2010
+@author: marcink
+A daemon will read from task table and run tasks
+"""
+import sys
+import os
+from pidlock import LockHeld, DaemonLock
+import traceback
+from os.path import dirname as dn
+from os.path import join as jn
+#to get the pylons_app import
+from pylons_app.config.environment import load_environment
+from pylons_app.model.hg_model import HgModel
+from whoosh.index import create_in, open_dir
+from shutil import rmtree
+import logging
+log = logging.getLogger(__name__)
+# Pattern handed to scan_paths() to find the repositories to index.
+# NOTE(review): hard-coded developer path -- presumably meant to come from
+# the application configuration.
+location = '/home/marcink/python_workspace_dirty/*'
+def scan_paths(root_location):
+    return HgModel.repo_scan('/', root_location, None, True)
+class WhooshIndexingDaemon(object):
+    """Daemon for atomic jobs: builds or refreshes the whoosh index.
+    NOTE(review): IDX_LOCATION, SCHEMA, IDX_NAME and EXCLUDE_EXTENSIONS are
+    used below but are not among the imports visible in this hunk.
+    """
+    def __init__(self, indexname='HG_INDEX'):
+        """
+        @param indexname: name of the index opened by update_index
+        """
+        self.indexname = indexname
+    def get_paths(self, root_dir):
+        """recursive walk in root dir and return a set of all path in that dir
+        excluding files in .hg dir"""
+        index_paths_ = set()
+        for path, dirs, files in os.walk(root_dir):
+            # substring test: any directory path containing '.hg' is skipped
+            if path.find('.hg') == -1:
+                for f in files:
+                    index_paths_.add(jn(path, f))
+        return index_paths_
+    def add_doc(self, writer, path, repo):
+        """Adding doc to writer
+        @param writer: object whose add_document() receives the fields
+        @param path: path of the file; also stored as the `path` field
+        @param repo: repository object the file belongs to
+        """
+        #files with an excluded extension are not read; they are added
+        #with empty content
+        if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
+            fobj = open(path, 'rb')
+            # NOTE(review): right-hand side missing in this copy of the
+            # changeset -- presumably the whole file is read from fobj.
+            content =
+            fobj.close()
+            try:
+                u_content = unicode(content)
+            except UnicodeDecodeError:
+                #incase we have a decode error just represent as byte string
+                u_content = unicode(str(content).encode('string_escape'))
+        else:
+            u_content = u''    
+        # NOTE(review): the owner= and repository= arguments below are
+        # truncated in this copy -- presumably attributes of `repo`.
+        writer.add_document(owner=unicode(,
+                            repository=u"%s" %,
+                            path=u"%s" % path,
+                            content=u_content,
+                            modtime=os.path.getmtime(path)) 
+    def build_index(self):
+        """Rebuild the index from scratch.
+        Removes IDX_LOCATION if it exists, recreates the directory, creates
+        a new index there and adds every file returned by get_paths() for
+        every repository found by scan_paths(location), then commits.
+        """
+        if os.path.exists(IDX_LOCATION):
+            rmtree(IDX_LOCATION)
+        if not os.path.exists(IDX_LOCATION):
+            os.mkdir(IDX_LOCATION)
+        # NOTE(review): the index is created under IDX_NAME while
+        # update_index opens self.indexname -- confirm both name the same index
+        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
+        writer = idx.writer()
+        for cnt, repo in enumerate(scan_paths(location).values()):
+            log.debug('building index @ %s' % repo.path)
+            for idx_path in self.get_paths(repo.path):
+                log.debug('    >> %s' % idx_path)
+                self.add_doc(writer, idx_path, repo)
+        writer.commit(merge=True)
+        log.debug('>>> FINISHED BUILDING INDEX <<<')
+    def update_index(self):
+        """Refresh the existing index in place.
+        Documents whose file no longer exists are deleted, files with an
+        mtime newer than the stored `modtime` are deleted and re-added, and
+        files not present in the index yet are added. Everything is written
+        with one commit at the end.
+        """
+        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
+        # The set of all paths in the index
+        indexed_paths = set()
+        # The set of all paths we need to re-index
+        to_index = set()
+        reader = idx.reader()
+        writer = idx.writer()
+        # Loop over the stored fields in the index
+        for fields in reader.all_stored_fields():
+            indexed_path = fields['path']
+            indexed_paths.add(indexed_path)
+            if not os.path.exists(indexed_path):
+                # This file was deleted since it was indexed
+                log.debug('removing from index %s' % indexed_path)
+                writer.delete_by_term('path', indexed_path)
+            else:
+                # Check if this file was changed since it
+                # was indexed
+                indexed_time = fields['modtime']
+                mtime = os.path.getmtime(indexed_path)
+                if mtime > indexed_time:
+                    # The file has changed, delete it and add it to the list of
+                    # files to reindex
+                    log.debug('adding to reindex list %s' % indexed_path)
+                    writer.delete_by_term('path', indexed_path)
+                    to_index.add(indexed_path)
+                    #writer.commit()
+        # Loop over the files in the filesystem
+        # Assume we have a function that gathers the filenames of the
+        # documents to be indexed
+        for repo in scan_paths(location).values():
+            for path in self.get_paths(repo.path):
+                if path in to_index or path not in indexed_paths:
+                    # This is either a file that's changed, or a new file
+                    # that wasn't indexed before. So index it!
+                    self.add_doc(writer, path, repo)
+                    log.debug('reindexing %s' % path)
+        writer.commit(merge=True)
+        #idx.optimize()
+        log.debug('>>> FINISHED <<<')
+    def run(self, full_index=False):
+        """Run daemon
+        @param full_index: True rebuilds the index from scratch, False
+            (the default) updates the existing one
+        """
+        if full_index:
+            self.build_index()
+        else:
+            self.update_index()
+if __name__ == "__main__":
+    #config = load_environment()
+    #print config
+    try:
+        l = DaemonLock()
+        WhooshIndexingDaemon().run(full_index=True)
+        l.release()
+    except LockHeld:
+        sys.exit(1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylons_app/lib/indexers/	Tue Aug 17 23:15:36 2010 +0200
@@ -0,0 +1,176 @@
+from multiprocessing import Process, Queue, cpu_count, Lock
+import socket, sys
+import time
+import os
+import sys
+from os.path import dirname as dn
+from multiprocessing.dummy import current_process
+from shutil import rmtree
+from pylons_app.model.hg_model import HgModel
+from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
+from whoosh.fields import TEXT, ID, STORED, Schema
+from whoosh.index import create_in, open_dir
+from datetime import datetime
+from multiprocessing.process import current_process
+from multiprocessing import Array, Value
+root = dn(dn(os.path.dirname(os.path.abspath(__file__))))
+idx_location = os.path.join(root, 'data', 'index')
+root_path = '/home/marcink/python_workspace_dirty/*'
+exclude_extensions = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 'swf',
+                       'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll']
+my_analyzer = RegexTokenizer() | LowercaseFilter()
+def scan_paths(root_location):
+    return HgModel.repo_scan('/', root_location, None, True)
+def index_paths(root_dir):
+    index_paths_ = set()
+    for path, dirs, files in os.walk(root_dir):
+        if path.find('.hg') == -1:
+        #if path.find('.hg') == -1 and path.find('bel-epa') != -1:    
+            for f in files:
+                index_paths_.add(os.path.join(path, f))
+    return index_paths_
+def get_schema():
+    return Schema(owner=TEXT(),
+                repository=TEXT(stored=True),
+                path=ID(stored=True, unique=True),
+                content=TEXT(stored=True, analyzer=my_analyzer),
+                modtime=STORED())
+def add_doc(writer, path, repo_name, contact):
+    """
+    Adding doc to writer
+    @param writer: object whose add_document() receives the fields
+    @param path: path of the file; stored as the `path` field and used to
+        read the content and the modification time
+    @param repo_name: value for the `repository` field
+    @param contact: value for the `owner` field
+    """
+    #files with an excluded extension are not read; they are added with
+    #empty content
+    if path.split('/')[-1].split('.')[-1].lower() not in exclude_extensions:
+        fobj = open(path, 'rb')
+        # NOTE(review): right-hand side missing in this copy of the
+        # changeset -- presumably the whole file is read from fobj.
+        content =
+        fobj.close()
+        try:
+            u_content = unicode(content)
+        except UnicodeDecodeError:
+            #incase we have a decode error just represent as byte string
+            u_content = unicode(str(content).encode('string_escape'))
+    else:
+        u_content = u''    
+    writer.add_document(repository=u"%s" % repo_name,
+                        owner=unicode(contact),
+                        path=u"%s" % path,
+                        content=u_content,
+                        modtime=os.path.getmtime(path)) 
+class MultiProcessIndexer(object):
+    """ multiprocessing whoosh indexer """
+    def __init__(self, idx, work_set=set(), nr_processes=cpu_count()):
+        q = Queue()
+        l = Lock()
+        work_set = work_set
+        writer = None
+        #writer = idx.writer()
+        for q_task in work_set:
+            q.put(q_task)
+        q.put('COMMIT')
+        #to stop all processes we have to put STOP to queue and 
+        #break the loop for each process
+        for _ in xrange(nr_processes):
+            q.put('STOP')
+        for _ in xrange(nr_processes):
+            p = Process(target=self.work_func, args=(q, l, idx, writer))
+            p.start()
+    def work_func(self, q, l, idx, writer):
+        """ worker class invoked by process """
+        writer = idx.writer()
+        while True:
+            q_task = q.get()
+            proc = current_process()
+#            if q_task == 'COMMIT':
+#                l.acquire()
+#                sys.stdout.write('%s commiting and STOP\n' % proc._name)
+#                writer.commit(merge=False)
+#                l.release()               
+#                break
+#            l.acquire()
+#            writer = idx.writer()
+#            l.release() 
+            if q_task == 'STOP':
+                sys.stdout.write('%s STOP\n' % proc._name)  
+                break
+            if q_task != 'COMMIT':
+                l.acquire()
+                sys.stdout.write('    >> %s %s %s @ ' % q_task)
+                sys.stdout.write(' %s \n' % proc._name)
+                l.release()
+                add_doc(writer, q_task[0], q_task[1], q_task[2])
+            l.acquire()
+            writer.commit(merge=True)
+            l.release()
+# Script entry point: with any command line argument the index is built with
+# MultiProcessIndexer, without one the stored paths are printed.
+if __name__ == "__main__":
+    #build queue
+    do = True if len(sys.argv) > 1 else False
+    q_tasks = []
+    # NOTE(review): the index directory is removed and recreated before the
+    # mode is checked, so the 'checking index' branch below reads an index
+    # that was just created.
+    if os.path.exists(idx_location):
+        rmtree(idx_location)
+    if not os.path.exists(idx_location):
+        os.mkdir(idx_location)
+    idx = create_in(idx_location, get_schema() , indexname='HG_INDEX')    
+    if do:
+        sys.stdout.write('Building queue...')
+        for cnt, repo in enumerate(scan_paths(root_path).values()):
+            # NOTE(review): the left operand of this comparison and two
+            # tuple members in the extend() call below are missing in this
+            # copy of the changeset -- presumably attributes of `repo`.
+            if != 'evoice_py':
+                continue            
+            q_tasks.extend([(idx_path,, for idx_path in index_paths(repo.path)])
+            # stop once the repository at position 4 has been queued
+            if cnt == 4:
+                break
+        sys.stdout.write('done\n')
+        mpi = MultiProcessIndexer(idx, q_tasks)
+    else:
+        print 'checking index'
+        reader = idx.reader()
+        # note: this name shadows the builtin all()
+        all = reader.all_stored_fields()
+        #print all
+        for fields in all:
+            print fields['path']
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylons_app/lib/indexers/	Tue Aug 17 23:15:36 2010 +0200
@@ -0,0 +1,127 @@
+import os, time
+import sys
+from warnings import warn
+# Raised by DaemonLock.trylock when the pid recorded in the lock file
+# belongs to a process that is still running.
+class LockHeld(Exception):pass
+class DaemonLock(object):
+    '''daemon locking
+    USAGE:
+    try:
+        l = lock()
+        main()
+        l.release()
+    except LockHeld:
+        sys.exit(1)
+    '''
+    def __init__(self, file=None, callbackfn=None,
+                 desc='daemon lock', debug=False):
+        self.pidfile = file if file else os.path.join(os.path.dirname(__file__),
+                                                      'running.lock')
+        self.callbackfn = callbackfn
+        self.desc = desc
+        self.debug = debug
+        self.held = False
+        #run the lock automatically !
+        self.lock()
+    def __del__(self):
+        if self.held:
+#            warn("use lock.release instead of del lock",
+#                    category = DeprecationWarning,
+#                    stacklevel = 2)
+            # ensure the lock will be removed
+            self.release()
+    def lock(self):
+        '''
+        locking function, if lock is present it will raise LockHeld exception
+        '''
+        lockname = '%s' % (os.getpid())
+        self.trylock()
+        self.makelock(lockname, self.pidfile)
+        return True
+    def trylock(self):
+        running_pid = False
+        try:
+            pidfile = open(self.pidfile, "r")
+            running_pid = pidfile.readline()
+            if self.debug:
+                print 'lock file present running_pid: %s, checking for execution'\
+                % running_pid
+            # Now we check the PID from lock file matches to the current
+            # process PID
+            if running_pid:
+                if os.path.exists("/proc/%s" % running_pid):
+                        print "You already have an instance of the program running"
+                        print "It is running as process %s" % running_pid
+                        raise LockHeld
+                else:
+                        print "Lock File is there but the program is not running"
+                        print "Removing lock file for the: %s" % running_pid
+                        self.release()
+        except IOError, e:
+            if e.errno != 2:
+                raise
+    def release(self):
+        '''
+        releases the pid by removing the pidfile
+        '''
+        if self.callbackfn:
+            #execute callback function on release
+            if self.debug:
+                print 'executing callback function %s' % self.callbackfn
+            self.callbackfn()
+        try:
+            if self.debug:
+                print 'removing pidfile %s' % self.pidfile
+            os.remove(self.pidfile)
+            self.held = False
+        except OSError, e:
+            if self.debug:
+                print 'removing pidfile failed %s' % e
+            pass
+    def makelock(self, lockname, pidfile):
+        '''
+        this function will make an actual lock
+        @param lockname: acctual pid of file
+        @param pidfile: the file to write the pid in
+        '''
+        if self.debug:
+            print 'creating a file %s and pid: %s' % (pidfile, lockname)
+        pidfile = open(self.pidfile, "wb")
+        pidfile.write(lockname)
+        pidfile.close
+        self.held = True
+def main():
+    print 'func is running'
+    cnt = 20
+    while 1:
+        print cnt
+        if cnt == 0:
+            break
+        time.sleep(1)
+        cnt -= 1
+if __name__ == "__main__":
+    try:
+        l = DaemonLock(desc='test lock')
+        main()
+        l.release()
+    except LockHeld:
+        sys.exit(1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylons_app/templates/search/search.html	Tue Aug 17 23:15:36 2010 +0200
@@ -0,0 +1,69 @@
+## -*- coding: utf-8 -*-
+<%inherit file="/base/base.html"/>
+<%def name="title()">
+   ${_('Search')}: ${c.cur_query}
+<%def name="breadcrumbs()">
+	${c.hg_app_name}
+<%def name="page_nav()">
+	${'home')}
+<%def name="main()">
+<div class="box">
+	<!-- box / title -->
+	<div class="title">
+		<h5>${_('Search')}</h5>
+	</div>
+	<!-- end box / title -->
+	${h.form('search',method='get')}
+	<div class="form">
+		<div class="fields">
+			<div class="field ">
+				<div class="label">
+					<label for="q">${_('Search:')}</label>
+				</div>
+				<div class="input">
+					${h.text('q',c.cur_query,class_="small")}
+					<div class="button highlight">
+						<input type="submit" value="${_('Search')}" class="ui-button ui-widget ui-state-default ui-corner-all"/>
+					</div>		
+					<div style="font-weight: bold;clear:both;padding: 5px">${c.runtime}</div>			
+				</div>
+			</div>
+		</div>
+	</div>
+	${h.end_form()}
+	%for cnt,sr in enumerate(c.formated_results):
+		%if h.HasRepoPermissionAny('repository.write','','repository.admin')(sr['repository'],'search results check'):
+		<div class="table">
+			<div id="body${cnt}" class="codeblock">
+				<div class="code-header">
+					<div class="revision">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
+					h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
+				</div>
+				<div class="code-body">
+					<pre>${h.literal(sr['content_short'])}</pre>
+				</div>
+			</div>
+		</div>
+		%else:
+			%if cnt == 0:
+			<div class="table">
+				<div id="body${cnt}" class="codeblock">
+					<div class="error">${_('Permission denied')}</div>
+				</div>
+			</div>		
+			%endif
+		%endif
+	%endfor
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylons_app/tests/functional/	Tue Aug 17 23:15:36 2010 +0200
@@ -0,0 +1,7 @@
+from pylons_app.tests import *
+class TestSearchController(TestController):
+    """Functional test for the search controller's index action."""
+    def test_index(self):
+        # NOTE(review): the call on the right-hand side is truncated in this
+        # copy of the changeset -- presumably a GET request for the 'search'
+        # controller's index action; restore it from the repository.
+        response ='search', action='index'))
+        # Test response...
--- a/	Tue Aug 17 22:29:17 2010 +0200
+++ b/	Tue Aug 17 23:15:36 2010 +0200
@@ -7,7 +7,7 @@
     from setuptools import setup, find_packages
-    name='pylons_app',
+    name='hg_app',
     description='Mercurial repository serving and browsing app',
     keywords='mercurial web hgwebdir replacement serving hgweb',
@@ -22,7 +22,8 @@
-        "pysqlite"
+        "pysqlite",
+        "whoosh==1.0.0b5",