# HG changeset patch # User Marcin Kuzminski # Date 1282079736 -7200 # Node ID b153a51b1d3b7b1e5c147039a41223e4df48576a # Parent bec06654d67b01e8eb82d26025c80608dcab07f1 Implemented search using whoosh. Still as experimental option. diff -r bec06654d67b -r b153a51b1d3b pylons_app/config/routing.py --- a/pylons_app/config/routing.py Tue Aug 17 22:29:17 2010 +0200 +++ b/pylons_app/config/routing.py Tue Aug 17 23:15:36 2010 +0200 @@ -108,6 +108,8 @@ m.connect('admin_home', '', action='index')#main page m.connect('admin_add_repo', '/add_repo/{new_repo:[a-z0-9\. _-]*}', action='add_repo') + #SEARCH + map.connect('search', '/_admin/search', controller='search') #LOGIN/LOGOUT map.connect('login_home', '/_admin/login', controller='login') diff -r bec06654d67b -r b153a51b1d3b pylons_app/controllers/search.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylons_app/controllers/search.py Tue Aug 17 23:15:36 2010 +0200 @@ -0,0 +1,112 @@ +#!/usr/bin/env python +# encoding: utf-8 +# search controller for pylons +# Copyright (C) 2009-2010 Marcin Kuzminski +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; version 2 +# of the License or (at your option) any later version of the license. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. 
+""" +Created on Aug 7, 2010 +search controller for pylons +@author: marcink +""" +from pylons import request, response, session, tmpl_context as c, url +from pylons.controllers.util import abort, redirect +from pylons_app.lib.auth import LoginRequired +from pylons_app.lib.base import BaseController, render +from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME +from webhelpers.html.builder import escape +from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \ + ContextFragmenter +from whoosh.index import open_dir, EmptyIndexError +from whoosh.qparser import QueryParser, QueryParserError +from whoosh.query import Phrase +import logging +import traceback + +log = logging.getLogger(__name__) + +class SearchController(BaseController): + + @LoginRequired() + def __before__(self): + super(SearchController, self).__before__() + + + def index(self): + c.formated_results = [] + c.runtime = '' + search_items = set() + c.cur_query = request.GET.get('q', None) + if c.cur_query: + cur_query = c.cur_query.lower() + + + if c.cur_query: + try: + idx = open_dir(IDX_LOCATION, indexname=IDX_NAME) #same index name the indexer daemon imports + searcher = idx.searcher() + + qp = QueryParser("content", schema=SCHEMA) + try: + query = qp.parse(unicode(cur_query)) + + if isinstance(query, Phrase): + search_items.update(query.words) + else: + for i in query.all_terms(): + search_items.add(i[1]) + + log.debug(query) + log.debug(search_items) + results = searcher.search(query) + c.runtime = '%s results (%.3f seconds)' \ + % (len(results), results.runtime) + + analyzer = ANALYZER + formatter = HtmlFormatter('span', + between='\n...\n') + + #how the parts are split within the same text part + fragmenter = SimpleFragmenter(200) + #fragmenter = ContextFragmenter(search_items) + + for res in results: + d = {} + d.update(res) + hl = highlight(escape(res['content']), search_items, + analyzer=analyzer, + fragmenter=fragmenter, + formatter=formatter, + top=5) + f_path = 
res['path'][res['path'].find(res['repository']) \ + + len(res['repository']):].lstrip('/') + d.update({'content_short':hl, + 'f_path':f_path}) + #del d['content'] + c.formated_results.append(d) + + except QueryParserError: + c.runtime = 'Invalid search query. Try quoting it.' + + except (EmptyIndexError, IOError): + log.error(traceback.format_exc()) + log.error('Empty Index data') + c.runtime = 'There is no index to search in. Please run whoosh indexer' + + + + # Return a rendered template + return render('/search/search.html') diff -r bec06654d67b -r b153a51b1d3b pylons_app/controllers/summary.py --- a/pylons_app/controllers/summary.py Tue Aug 17 22:29:17 2010 +0200 +++ b/pylons_app/controllers/summary.py Tue Aug 17 23:15:36 2010 +0200 @@ -113,7 +113,7 @@ % (author.decode('utf8'), [[x, aggregate[author][x]] for x in aggregate[author]])) if d == '': - d = '"%s":{label:"%s",data:[[0,0],]}' \ + d = '"%s":{label:"%s",data:[[0,1],]}' \ % (author_key_cleaner(repo.contact), author_key_cleaner(repo.contact)) return d diff -r bec06654d67b -r b153a51b1d3b pylons_app/lib/indexers/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylons_app/lib/indexers/__init__.py Tue Aug 17 23:15:36 2010 +0200 @@ -0,0 +1,36 @@ +import sys +import os +from pidlock import LockHeld, DaemonLock +import traceback + +from os.path import dirname as dn +from os.path import join as jn + +#to get the pylons_app import +sys.path.append(dn(dn(dn(os.path.realpath(__file__))))) + +from pylons_app.config.environment import load_environment +from pylons_app.model.hg_model import HgModel +from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter +from whoosh.fields import TEXT, ID, STORED, Schema +from whoosh.index import create_in, open_dir +from shutil import rmtree + +#LOCATION WE KEEP THE INDEX +IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index') + +#EXTENSION TO SKIP READING CONTENT ON +EXCLUDE_EXTENSIONS = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 
'swf', + 'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll'] + +#CUSTOM ANALYZER wordsplit + lowercase filter +ANALYZER = RegexTokenizer() | LowercaseFilter() + +#INDEX SCHEMA DEFINITION +SCHEMA = Schema(owner=TEXT(), + repository=TEXT(stored=True), + path=ID(stored=True, unique=True), + content=TEXT(stored=True, analyzer=ANALYZER), + modtime=STORED()) + +IDX_NAME = 'HG_INDEX' diff -r bec06654d67b -r b153a51b1d3b pylons_app/lib/indexers/daemon.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylons_app/lib/indexers/daemon.py Tue Aug 17 23:15:36 2010 +0200 @@ -0,0 +1,181 @@ +#!/usr/bin/env python +# encoding: utf-8 +# whoosh indexer daemon for hg-app +# Copyright (C) 2009-2010 Marcin Kuzminski +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; version 2 +# of the License or (at your option) any later version of the license. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. 
+""" +Created on Jan 26, 2010 + +@author: marcink +A daemon will read from task table and run tasks +""" +import sys +import os +from pidlock import LockHeld, DaemonLock +import traceback + +from os.path import dirname as dn +from os.path import join as jn + +#to get the pylons_app import +sys.path.append(dn(dn(dn(os.path.realpath(__file__))))) + +from pylons_app.config.environment import load_environment +from pylons_app.model.hg_model import HgModel +from whoosh.index import create_in, open_dir +from shutil import rmtree +from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME +import logging +log = logging.getLogger(__name__) + + +location = '/home/marcink/python_workspace_dirty/*' + +def scan_paths(root_location): + return HgModel.repo_scan('/', root_location, None, True) + +class WhooshIndexingDaemon(object): + """Daemon for atomic jobs""" + + def __init__(self, indexname='HG_INDEX'): + self.indexname = indexname + + + def get_paths(self, root_dir): + """recursive walk in root dir and return a set of all path in that dir + excluding files in .hg dir""" + index_paths_ = set() + for path, dirs, files in os.walk(root_dir): + if path.find('.hg') == -1: + for f in files: + index_paths_.add(jn(path, f)) + + return index_paths_ + + def add_doc(self, writer, path, repo): + """Adding doc to writer""" + + #we don't want to read excluded file extensions just index them + if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS: + fobj = open(path, 'rb') + content = fobj.read() + fobj.close() + try: + u_content = unicode(content) + except UnicodeDecodeError: + #in case we have a decode error just represent as byte string + u_content = unicode(str(content).encode('string_escape')) + else: + u_content = u'' + writer.add_document(owner=unicode(repo.contact), + repository=u"%s" % repo.name, + path=u"%s" % path, + content=u_content, + modtime=os.path.getmtime(path)) + + def build_index(self): + if 
os.path.exists(IDX_LOCATION): + rmtree(IDX_LOCATION) + + if not os.path.exists(IDX_LOCATION): + os.mkdir(IDX_LOCATION) + + idx = create_in(IDX_LOCATION, SCHEMA, indexname=self.indexname) #same name update_index opens + writer = idx.writer() + + for cnt, repo in enumerate(scan_paths(location).values()): + log.debug('building index @ %s' % repo.path) + + for idx_path in self.get_paths(repo.path): + log.debug(' >> %s' % idx_path) + self.add_doc(writer, idx_path, repo) + writer.commit(merge=True) + + log.debug('>>> FINISHED BUILDING INDEX <<<') + + + def update_index(self): + log.debug('STARTING INCREMENTAL INDEXING UPDATE') + + idx = open_dir(IDX_LOCATION, indexname=self.indexname) + # The set of all paths in the index + indexed_paths = set() + # The set of all paths we need to re-index + to_index = set() + + reader = idx.reader() + writer = idx.writer() + + # Loop over the stored fields in the index + for fields in reader.all_stored_fields(): + indexed_path = fields['path'] + indexed_paths.add(indexed_path) + + if not os.path.exists(indexed_path): + # This file was deleted since it was indexed + log.debug('removing from index %s' % indexed_path) + writer.delete_by_term('path', indexed_path) + + else: + # Check if this file was changed since it + # was indexed + indexed_time = fields['modtime'] + + mtime = os.path.getmtime(indexed_path) + + if mtime > indexed_time: + + # The file has changed, delete it and add it to the list of + # files to reindex + log.debug('adding to reindex list %s' % indexed_path) + writer.delete_by_term('path', indexed_path) + to_index.add(indexed_path) + #writer.commit() + + # Loop over the files in the filesystem + # Assume we have a function that gathers the filenames of the + # documents to be indexed + for repo in scan_paths(location).values(): + for path in self.get_paths(repo.path): + if path in to_index or path not in indexed_paths: + # This is either a file that's changed, or a new file + # that wasn't indexed before. So index it! 
+ self.add_doc(writer, path, repo) + log.debug('reindexing %s' % path) + + writer.commit(merge=True) + #idx.optimize() + log.debug('>>> FINISHED <<<') + + def run(self, full_index=False): + """Run daemon""" + if full_index: + self.build_index() + else: + self.update_index() + +if __name__ == "__main__": + + #config = load_environment() + #print config + try: + l = DaemonLock() + WhooshIndexingDaemon().run(full_index=True) + l.release() + except LockHeld: + sys.exit(1) + diff -r bec06654d67b -r b153a51b1d3b pylons_app/lib/indexers/multiprocessing_indexer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylons_app/lib/indexers/multiprocessing_indexer.py Tue Aug 17 23:15:36 2010 +0200 @@ -0,0 +1,176 @@ +from multiprocessing import Process, Queue, cpu_count, Lock +import socket, sys +import time +import os +import sys +from os.path import dirname as dn +from multiprocessing.dummy import current_process +from shutil import rmtree + +sys.path.append(dn(dn(dn(os.path.realpath(__file__))))) + +from pylons_app.model.hg_model import HgModel +from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter +from whoosh.fields import TEXT, ID, STORED, Schema +from whoosh.index import create_in, open_dir +from datetime import datetime +from multiprocessing.process import current_process +from multiprocessing import Array, Value + +root = dn(dn(os.path.dirname(os.path.abspath(__file__)))) +idx_location = os.path.join(root, 'data', 'index') +root_path = '/home/marcink/python_workspace_dirty/*' + +exclude_extensions = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 'swf', + 'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll'] + +my_analyzer = RegexTokenizer() | LowercaseFilter() +def scan_paths(root_location): + return HgModel.repo_scan('/', root_location, None, True) + +def index_paths(root_dir): + index_paths_ = set() + for path, dirs, files in os.walk(root_dir): + if path.find('.hg') == -1: + #if path.find('.hg') == -1 and path.find('bel-epa') != -1: + for f in files: + 
index_paths_.add(os.path.join(path, f)) + + return index_paths_ + +def get_schema(): + return Schema(owner=TEXT(), + repository=TEXT(stored=True), + path=ID(stored=True, unique=True), + content=TEXT(stored=True, analyzer=my_analyzer), + modtime=STORED()) + +def add_doc(writer, path, repo_name, contact): + """ + Adding doc to writer + @param writer: + @param path: + @param repo: + @param fname: + """ + + #we don't want to read excluded file extensions just index them + if path.split('/')[-1].split('.')[-1].lower() not in exclude_extensions: + fobj = open(path, 'rb') + content = fobj.read() + fobj.close() + try: + u_content = unicode(content) + except UnicodeDecodeError: + #in case we have a decode error just represent as byte string + u_content = unicode(str(content).encode('string_escape')) + else: + u_content = u'' + writer.add_document(repository=u"%s" % repo_name, + owner=unicode(contact), + path=u"%s" % path, + content=u_content, + modtime=os.path.getmtime(path)) + + +class MultiProcessIndexer(object): + """ multiprocessing whoosh indexer """ + + def __init__(self, idx, work_set=(), nr_processes=cpu_count()): + q = Queue() + l = Lock() + work_set = work_set + writer = None + #writer = idx.writer() + + for q_task in work_set: + q.put(q_task) + + q.put('COMMIT') + + #to stop all processes we have to put STOP to queue and + #break the loop for each process + for _ in xrange(nr_processes): + q.put('STOP') + + + for _ in xrange(nr_processes): + p = Process(target=self.work_func, args=(q, l, idx, writer)) + p.start() + + + + def work_func(self, q, l, idx, writer): + """ worker function invoked by each process """ + + + writer = idx.writer() + + while True: + q_task = q.get() + proc = current_process() + +# if q_task == 'COMMIT': +# l.acquire() +# sys.stdout.write('%s commiting and STOP\n' % proc._name) +# writer.commit(merge=False) +# l.release() +# break +# l.acquire() +# writer = idx.writer() +# l.release() + + if q_task == 'STOP': + sys.stdout.write('%s STOP\n' % 
proc._name) + break + + if q_task != 'COMMIT': + l.acquire() + + sys.stdout.write(' >> %s %s %s @ ' % q_task) + sys.stdout.write(' %s \n' % proc._name) + + l.release() + add_doc(writer, q_task[0], q_task[1], q_task[2]) + + l.acquire() + writer.commit(merge=True) + l.release() + + +if __name__ == "__main__": + #build queue + do = True if len(sys.argv) > 1 else False + q_tasks = [] + + if os.path.exists(idx_location): + rmtree(idx_location) + + if not os.path.exists(idx_location): + os.mkdir(idx_location) + + idx = create_in(idx_location, get_schema() , indexname='HG_INDEX') + + + if do: + sys.stdout.write('Building queue...') + for cnt, repo in enumerate(scan_paths(root_path).values()): + if repo.name != 'evoice_py': + continue + q_tasks.extend([(idx_path, repo.name, repo.contact) for idx_path in index_paths(repo.path)]) + if cnt == 4: + break + + sys.stdout.write('done\n') + + mpi = MultiProcessIndexer(idx, q_tasks) + + + else: + print 'checking index' + reader = idx.reader() + all = reader.all_stored_fields() + #print all + for fields in all: + print fields['path'] + diff -r bec06654d67b -r b153a51b1d3b pylons_app/lib/indexers/pidlock.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylons_app/lib/indexers/pidlock.py Tue Aug 17 23:15:36 2010 +0200 @@ -0,0 +1,127 @@ +import os, time +import sys +from warnings import warn + +class LockHeld(Exception):pass + + +class DaemonLock(object): + '''daemon locking + USAGE: + try: + l = lock() + main() + l.release() + except LockHeld: + sys.exit(1) + ''' + + def __init__(self, file=None, callbackfn=None, + desc='daemon lock', debug=False): + + self.pidfile = file if file else os.path.join(os.path.dirname(__file__), + 'running.lock') + self.callbackfn = callbackfn + self.desc = desc + self.debug = debug + self.held = False + #run the lock automatically ! 
+ self.lock() + + def __del__(self): + if self.held: + +# warn("use lock.release instead of del lock", +# category = DeprecationWarning, +# stacklevel = 2) + + # ensure the lock will be removed + self.release() + + + def lock(self): + ''' + locking function, if lock is present it will raise LockHeld exception + ''' + lockname = '%s' % (os.getpid()) + + self.trylock() + self.makelock(lockname, self.pidfile) + return True + + def trylock(self): + running_pid = False + try: + pidfile = open(self.pidfile, "r") + pidfile.seek(0) + running_pid = pidfile.readline() + if self.debug: + print 'lock file present running_pid: %s, checking for execution'\ + % running_pid + # Now we check the PID from lock file matches to the current + # process PID + if running_pid: + if os.path.exists("/proc/%s" % running_pid): + print "You already have an instance of the program running" + print "It is running as process %s" % running_pid + raise LockHeld + else: + print "Lock File is there but the program is not running" + print "Removing lock file for the: %s" % running_pid + self.release() + except IOError, e: + if e.errno != 2: + raise + + + def release(self): + ''' + releases the pid by removing the pidfile + ''' + if self.callbackfn: + #execute callback function on release + if self.debug: + print 'executing callback function %s' % self.callbackfn + self.callbackfn() + try: + if self.debug: + print 'removing pidfile %s' % self.pidfile + os.remove(self.pidfile) + self.held = False + except OSError, e: + if self.debug: + print 'removing pidfile failed %s' % e + pass + + def makelock(self, lockname, pidfile): + ''' + this function will make an actual lock + @param lockname: pid of the current process, as a string + @param pidfile: the file to write the pid in + ''' + if self.debug: + print 'creating a file %s and pid: %s' % (pidfile, lockname) + pidfile = open(self.pidfile, "wb") + pidfile.write(lockname) + pidfile.close() #call close() so the pid is flushed to the lock file + self.held = True + + +def main(): + print 'func is running' + cnt = 20 + while 1: + 
print cnt + if cnt == 0: + break + time.sleep(1) + cnt -= 1 + + +if __name__ == "__main__": + try: + l = DaemonLock(desc='test lock') + main() + l.release() + except LockHeld: + sys.exit(1) diff -r bec06654d67b -r b153a51b1d3b pylons_app/templates/search/search.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylons_app/templates/search/search.html Tue Aug 17 23:15:36 2010 +0200 @@ -0,0 +1,69 @@ +## -*- coding: utf-8 -*- +<%inherit file="/base/base.html"/> +<%def name="title()"> + ${_('Search')}: ${c.cur_query} + +<%def name="breadcrumbs()"> + ${c.hg_app_name} + +<%def name="page_nav()"> + ${self.menu('home')} + +<%def name="main()"> + +

+ +

${_('Search')}

+ + ${h.form('search',method='get')} +

+ +

+ ${_('Search:')} +

+ ${h.text('q',c.cur_query,class_="small")} +

+ +

${c.runtime}

+ ${h.end_form()} + + %for cnt,sr in enumerate(c.formated_results): + %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'): +

+				
+					${h.link_to(h.literal('%s » %s' % (sr['repository'],sr['f_path'])),
+					h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}

+				

+				
+					${h.literal(sr['content_short'])}
+				

+			

+ %else: + %if cnt == 0: +

+					${_('Permission denied')}

+				

+ %endif + + %endif + %endfor + + + +

+ + diff -r bec06654d67b -r b153a51b1d3b pylons_app/tests/functional/test_search.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylons_app/tests/functional/test_search.py Tue Aug 17 23:15:36 2010 +0200 @@ -0,0 +1,7 @@ +from pylons_app.tests import * + +class TestSearchController(TestController): + + def test_index(self): + response = self.app.get(url(controller='search', action='index')) + # Test response... diff -r bec06654d67b -r b153a51b1d3b setup.py --- a/setup.py Tue Aug 17 22:29:17 2010 +0200 +++ b/setup.py Tue Aug 17 23:15:36 2010 +0200 @@ -7,7 +7,7 @@ from setuptools import setup, find_packages setup( - name='pylons_app', + name='hg_app', version=get_version(), description='Mercurial repository serving and browsing app', keywords='mercurial web hgwebdir replacement serving hgweb', @@ -22,7 +22,8 @@ "vcs>=0.1.4", "pygments>=1.3.0", "mercurial>=1.6", - "pysqlite" + "pysqlite", + "whoosh==1.0.0b5", ], setup_requires=["PasteScript>=1.6.3"], packages=find_packages(exclude=['ez_setup']),