changeset 478:7010af6efde5 celery

Reimplemented searching for speed on large files and added paging for search results. Updated setup requirements.
author Marcin Kuzminski <marcin@python-works.com>
date Thu, 16 Sep 2010 02:59:47 +0200
parents fdebc5f67dc6
children 149940ba96d9
files pylons_app/controllers/search.py pylons_app/lib/indexers/__init__.py pylons_app/templates/search/search.html setup.py
diffstat 4 files changed, 155 insertions(+), 69 deletions(-)
--- a/pylons_app/controllers/search.py	Tue Sep 14 17:34:15 2010 +0200
+++ b/pylons_app/controllers/search.py	Thu Sep 16 02:59:47 2010 +0200
@@ -26,10 +26,9 @@
 from pylons.controllers.util import abort, redirect
 from pylons_app.lib.auth import LoginRequired
 from pylons_app.lib.base import BaseController, render
-from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
-from webhelpers.html.builder import escape
-from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
-    ContextFragmenter
+from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
+from webhelpers.paginate import Page
+from webhelpers.util import update_params
 from pylons.i18n.translation import _
 from whoosh.index import open_dir, EmptyIndexError
 from whoosh.qparser import QueryParser, QueryParserError
@@ -45,69 +44,55 @@
     def __before__(self):
         super(SearchController, self).__before__()    
 
-
     def index(self):
         c.formated_results = []
         c.runtime = ''
-        search_items = set()
         c.cur_query = request.GET.get('q', None)
         if c.cur_query:
             cur_query = c.cur_query.lower()
         
-        
         if c.cur_query:
+            p = int(request.params.get('page', 1))
+            highlight_items = set()
             try:
                 idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
                 searcher = idx.searcher()
-            
+
                 qp = QueryParser("content", schema=SCHEMA)
                 try:
                     query = qp.parse(unicode(cur_query))
                     
                     if isinstance(query, Phrase):
-                        search_items.update(query.words)
+                        highlight_items.update(query.words)
                     else:
                         for i in query.all_terms():
-                            search_items.add(i[1])
-                        
-                    log.debug(query)
-                    log.debug(search_items)
-                    results = searcher.search(query)
-                    c.runtime = '%s results (%.3f seconds)' \
-                    % (len(results), results.runtime)
+                            if i[0] == 'content':
+                                highlight_items.add(i[1])
 
-                    analyzer = ANALYZER
-                    formatter = HtmlFormatter('span',
-                        between='\n<span class="break">...</span>\n') 
-                    
-                    #how the parts are splitted within the same text part
-                    fragmenter = SimpleFragmenter(200)
-                    #fragmenter = ContextFragmenter(search_items)
+                    matcher = query.matcher(searcher)
                     
-                    for res in results:
-                        d = {}
-                        d.update(res)
-                        hl = highlight(escape(res['content']), search_items,
-                                                         analyzer=analyzer,
-                                                         fragmenter=fragmenter,
-                                                         formatter=formatter,
-                                                         top=5)
-                        f_path = res['path'][res['path'].find(res['repository']) \
-                                             + len(res['repository']):].lstrip('/')
-                        d.update({'content_short':hl,
-                                  'f_path':f_path})
-                        #del d['content']
-                        c.formated_results.append(d)
-                                                    
+                    log.debug(query)
+                    log.debug(highlight_items)
+                    results = searcher.search(query)
+                    res_ln = len(results)
+                    c.runtime = '%s results (%.3f seconds)' \
+                    % (res_ln, results.runtime)
+                    
+                    def url_generator(**kw):
+                        return update_params("?q=%s" % c.cur_query, **kw)
+
+                    c.formated_results = Page(
+                                ResultWrapper(searcher, matcher, highlight_items),
+                                page=p, item_count=res_ln,
+                                items_per_page=10, url=url_generator)
+                           
                 except QueryParserError:
                     c.runtime = _('Invalid search query. Try quoting it.')
-
+                searcher.close()
             except (EmptyIndexError, IOError):
                 log.error(traceback.format_exc())
                 log.error('Empty Index data')
                 c.runtime = _('There is no index to search in. Please run whoosh indexer')
-            
-
-                
+                        
         # Return a rendered template
         return render('/search/search.html')
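
A rough, standalone sketch of the pagination contract the controller now relies on, assuming the webhelpers release this app pins: webhelpers.paginate.Page only needs len() and slicing from its collection, which is exactly what ResultWrapper provides, so stored fields and highlighting are computed only for the hits on the requested page. FakeResults and its fake hits below are hypothetical stand-ins for illustration only.

    from webhelpers.paginate import Page

    class FakeResults(object):
        """Stand-in for ResultWrapper: Page only asks for len() and a slice."""
        def __init__(self, size):
            self.size = size

        def __len__(self):
            return self.size

        def __getitem__(self, i):
            return 'hit %d' % i

        def __getslice__(self, i, j):
            # only the current page is ever materialized
            return [self[k] for k in range(i, min(j, self.size))]

    results = FakeResults(95)
    page = Page(results, page=2, item_count=len(results), items_per_page=10)
    print list(page)    # ['hit 10', ..., 'hit 19'] - the second page only
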
--- a/pylons_app/lib/indexers/__init__.py	Tue Sep 14 17:34:15 2010 +0200
+++ b/pylons_app/lib/indexers/__init__.py	Thu Sep 16 02:59:47 2010 +0200
@@ -1,41 +1,140 @@
-import sys
+from os.path import dirname as dn, join as jn
+from pidlock import LockHeld, DaemonLock
+from pylons_app.config.environment import load_environment
+from pylons_app.model.hg_model import HgModel
+from shutil import rmtree
+from webhelpers.html.builder import escape
+from vcs.utils.lazy import LazyProperty
+
+from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
+from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
+from whoosh.index import create_in, open_dir
+from whoosh.formats import Characters
+from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter   
+
 import os
-from pidlock import LockHeld, DaemonLock
+import sys
 import traceback
 
-from os.path import dirname as dn
-from os.path import join as jn
+
 
 #to get the pylons_app import
 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
 
-from pylons_app.config.environment import load_environment
-from pylons_app.model.hg_model import HgModel
-from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
-from whoosh.fields import TEXT, ID, STORED, Schema
-from whoosh.index import create_in, open_dir
-from shutil import rmtree
 
 #LOCATION WE KEEP THE INDEX
 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
 
 #EXTENSIONS WE WANT TO INDEX CONTENT OF
-INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c', 
-                    'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h', 
-                    'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp', 
-                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3', 
-                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql', 
-                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt', 
+INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
+                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
+                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
+                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
+                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
+                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                     'yaws']
 
 #CUSTOM ANALYZER wordsplit + lowercase filter
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 
+
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(owner=TEXT(),
                 repository=TEXT(stored=True),
                 path=ID(stored=True, unique=True),
-                content=TEXT(stored=True, analyzer=ANALYZER),
-                modtime=STORED(),extension=TEXT(stored=True))
+                content=FieldType(format=Characters(ANALYZER),
+                             scorable=True, stored=True),
+                modtime=STORED(), extension=TEXT(stored=True))
+
+
+IDX_NAME = 'HG_INDEX'
+FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') 
+FRAGMENTER = SimpleFragmenter(200)
+                 
+                    
+
+                            
+class ResultWrapper(object):
+    def __init__(self, searcher, matcher, highlight_items):
+        self.searcher = searcher
+        self.matcher = matcher
+        self.highlight_items = highlight_items
+        self.fragment_size = 150 * 2
+    
+    @LazyProperty
+    def doc_ids(self):
+        docs_id = []
+        while self.matcher.is_active():
+            docnum = self.matcher.id()
+            docs_id.append(docnum)
+            self.matcher.next()
+        return docs_id   
+        
+    def __str__(self):
+        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __len__(self):
+        return len(self.doc_ids)
+
+    def __iter__(self):
+        """
+        Allows iteration over results and lazily generates content.
+
+        *Requires* implementation of the ``__getitem__`` method.
+        """
+        for docid in self.doc_ids:
+            yield self.get_full_content(docid)
 
-IDX_NAME = 'HG_INDEX'
\ No newline at end of file
+    def __getslice__(self, i, j):
+        """
+        Slicing of ResultWrapper results
+        """
+        slice = []
+        for docid in self.doc_ids[i:j]:
+            slice.append(self.get_full_content(docid))
+        return slice   
+                            
+
+    def get_full_content(self, docid):
+        res = self.searcher.stored_fields(docid)
+        f_path = res['path'][res['path'].find(res['repository']) \
+                             + len(res['repository']):].lstrip('/')
+        
+        content_short = ''.join(self.get_short_content(res))
+        res.update({'content_short':content_short,
+                    'content_short_hl':self.highlight(content_short),
+                    'f_path':f_path})
+        
+        return res        
+
+    def get_short_content(self, res):
+        """
+        Chunks the content around each match, without overlapping
+        chunks, so the same close occurrences are not highlighted
+        twice.
+        @param res: stored fields of the matched document
+        """
+        memory = [(0, 0)]
+        for span in self.matcher.spans():
+            start = span.startchar or 0
+            end = span.endchar or 0
+            start_offseted = max(0, start - self.fragment_size)
+            end_offseted = end + self.fragment_size
+            if start_offseted < memory[-1][1]:
+                start_offseted = memory[-1][1]
+            memory.append((start_offseted, end_offseted,))    
+            yield res["content"][start_offseted:end_offseted]  
+        
+    def highlight(self, content, top=5):
+        hl = highlight(escape(content),
+                 self.highlight_items,
+                 analyzer=ANALYZER,
+                 fragmenter=FRAGMENTER,
+                 formatter=FORMATTER,
+                 top=top)
+        return hl 
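
To make the windowing logic in get_short_content easier to follow, here is a self-contained sketch of the same idea, free of any whoosh objects; excerpt_windows and the example offsets are hypothetical and only illustrate the technique. Each match span is widened by fragment_size characters on both sides, and a window is clamped to start where the previous one ended, so nearby matches never yield overlapping (and hence doubly highlighted) excerpts.

    def excerpt_windows(spans, text_len, fragment_size=300):
        """Yield (start, end) character windows around match spans,
        clamped so consecutive windows never overlap."""
        last_end = 0
        for start, end in spans:
            win_start = max(0, start - fragment_size)
            win_end = min(text_len, end + fragment_size)
            if win_start < last_end:
                win_start = last_end
            last_end = win_end
            yield (win_start, win_end)

    # two matches only 100 characters apart collapse into adjoining windows
    print list(excerpt_windows([(400, 410), (500, 510)], text_len=2000))
    # -> [(100, 710), (710, 810)]
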
--- a/pylons_app/templates/search/search.html	Tue Sep 14 17:34:15 2010 +0200
+++ b/pylons_app/templates/search/search.html	Thu Sep 16 02:59:47 2010 +0200
@@ -46,7 +46,7 @@
 					h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
 				</div>
 				<div class="code-body">
-					<pre>${h.literal(sr['content_short'])}</pre>
+					<pre>${h.literal(sr['content_short_hl'])}</pre>
 				</div>
 			</div>
 		</div>
@@ -59,11 +59,13 @@
 			</div>		
 			%endif
 			
-		%endif
+		%endif		
 	%endfor
-
-	
-	
+	%if c.cur_query:
+	<div class="pagination-wh pagination-left">
+		${c.formated_results.pager('$link_previous ~2~ $link_next')}
+	</div>	
+	%endif
 </div>
 
 </%def>    
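
For reference, a standalone demo of the pager call added to the template above, assuming the webhelpers version the controller change relies on; the demo collection and the ?q=demo query string are made up. In the format string, ~2~ expands to numbered links within a radius of two pages around the current one and $link_previous/$link_next become prev/next links, while the url callable given to Page keeps the original search query in every generated link.

    from webhelpers.paginate import Page
    from webhelpers.util import update_params

    def url_generator(**kw):
        # preserve ?q=... and let the pager swap in its own page number
        return update_params("?q=demo", **kw)

    demo = Page(range(100), page=5, items_per_page=10, url=url_generator)
    print demo.pager('$link_previous ~2~ $link_next')
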
--- a/setup.py	Tue Sep 14 17:34:15 2010 +0200
+++ b/setup.py	Thu Sep 16 02:59:47 2010 +0200
@@ -7,7 +7,7 @@
     from setuptools import setup, find_packages
 
 setup(
-    name='HgApp-%s'%get_version(),
+    name='HgApp-%s' % get_version(),
     version=get_version(),
     description='Mercurial repository serving and browsing app',
     keywords='mercurial web hgwebdir replacement serving hgweb',
@@ -20,11 +20,11 @@
         "SQLAlchemy>=0.6",
         "babel",
         "Mako>=0.3.2",
-        "vcs>=0.1.4",
+        "vcs>=0.1.5",
         "pygments>=1.3.0",
         "mercurial>=1.6",
         "pysqlite",
-        "whoosh==1.0.0b10",
+        "whoosh==1.0.0b16",
         "py-bcrypt",
         "celery",
     ],