view pylons_app/lib/indexers/__init__.py @ 436:28f19fa562df

updated config files, Implemented content index extensions with whoosh, fixed analyzer to match more words
author Marcin Kuzminski <marcin@python-works.com>
date Sat, 28 Aug 2010 14:53:32 +0200
parents b153a51b1d3b
children 7010af6efde5
line wrap: on
line source

import sys
import os
from pidlock import LockHeld, DaemonLock
import traceback

from os.path import dirname as dn
from os.path import join as jn

# Put the directory that *contains* the pylons_app package on sys.path so the
# pylons_app imports below resolve. This file is
# pylons_app/lib/indexers/__init__.py, so that directory is four levels up
# (the same root IDX_LOCATION is built from); three levels up is the
# pylons_app package directory itself, which does not make the package
# importable.
sys.path.append(dn(dn(dn(dn(os.path.realpath(__file__))))))

from pylons_app.config.environment import load_environment
from pylons_app.model.hg_model import HgModel
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.fields import TEXT, ID, STORED, Schema
from whoosh.index import create_in, open_dir
from shutil import rmtree

# Where the index is kept: the "data/index" directory under the folder that
# sits four levels above this file.
IDX_LOCATION = jn(
    dn(dn(dn(dn(os.path.abspath(__file__))))),
    'data',
    'index',
)

# File extensions (without the leading dot) whose content we want to index.
# Kept as a list, in alphabetical order, one initial letter per row.
INDEX_EXTENSIONS = [
    'action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd',
    'c', 'cfm', 'cpp', 'cs', 'css',
    'diff', 'do',
    'el', 'erl',
    'h', 'htm', 'html',
    'ini',
    'java', 'js', 'jsp', 'jspx',
    'lisp', 'lua',
    'm', 'mako', 'ml',
    'pas', 'patch', 'php', 'php3', 'php4', 'phtml', 'pm', 'py',
    'rb', 'rst',
    's', 'sh', 'sql',
    'tpl', 'txt',
    'vim',
    'wss',
    'xhtml', 'xml', 'xsl', 'xslt',
    'yaws',
]

# Custom analyzer for file content: split the text into runs of word
# characters (\w+), then pass every token through a lowercase filter.
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# Index schema definition: the fields that make up one indexed document.
SCHEMA = Schema(
    owner=TEXT(),
    repository=TEXT(stored=True),
    path=ID(stored=True, unique=True),  # the only field declared unique
    content=TEXT(stored=True, analyzer=ANALYZER),  # uses the analyzer above
    modtime=STORED(),
    extension=TEXT(stored=True),
)

# Name given to the index; presumably the indexname passed to whoosh's
# create_in/open_dir together with IDX_LOCATION -- confirm at the call sites.
IDX_NAME = 'HG_INDEX'