comparison rhodecode/lib/indexers/daemon.py @ 560:3072935bdeed

Rewrote whoosh indexing to walk the repository through the internal repository.walk() instead of walking the filesystem. Disabled the default hg update hook (no longer needed, since whoosh no longer depends on files on the filesystem for indexing).
author Marcin Kuzminski <marcin@python-works.com>
date Sat, 09 Oct 2010 00:22:19 +0200
parents 29ec9ddbe258
children 5f3b967d9d10
comparison
equal deleted inserted replaced
559:bc4633a41967 560:3072935bdeed
37 from rhodecode.lib.helpers import safe_unicode 37 from rhodecode.lib.helpers import safe_unicode
38 from whoosh.index import create_in, open_dir 38 from whoosh.index import create_in, open_dir
39 from shutil import rmtree 39 from shutil import rmtree
40 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME 40 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
41 41
42 from time import mktime
43 from vcs.backends import hg
44
42 import logging 45 import logging
43 46
44 log = logging.getLogger('whooshIndexer') 47 log = logging.getLogger('whooshIndexer')
45 # create logger 48 # create logger
46 log.setLevel(logging.DEBUG) 49 log.setLevel(logging.DEBUG)
60 63
61 def scan_paths(root_location): 64 def scan_paths(root_location):
62 return HgModel.repo_scan('/', root_location, None, True) 65 return HgModel.repo_scan('/', root_location, None, True)
63 66
64 class WhooshIndexingDaemon(object): 67 class WhooshIndexingDaemon(object):
65 """Deamon for atomic jobs""" 68 """
69 Deamon for atomic jobs
70 """
66 71
67 def __init__(self, indexname='HG_INDEX', repo_location=None): 72 def __init__(self, indexname='HG_INDEX', repo_location=None):
68 self.indexname = indexname 73 self.indexname = indexname
69 self.repo_location = repo_location 74 self.repo_location = repo_location
70 self.initial = False 75 self.initial = False
71 if not os.path.isdir(IDX_LOCATION): 76 if not os.path.isdir(IDX_LOCATION):
72 os.mkdir(IDX_LOCATION) 77 os.mkdir(IDX_LOCATION)
73 log.info('Cannot run incremental index since it does not' 78 log.info('Cannot run incremental index since it does not'
74 ' yet exist running full build') 79 ' yet exist running full build')
75 self.initial = True 80 self.initial = True
76 81
77 def get_paths(self, root_dir): 82 def get_paths(self, root_dir):
78 """recursive walk in root dir and return a set of all path in that dir 83 """
79 excluding files in .hg dir""" 84 recursive walk in root dir and return a set of all path in that dir
85 based on repository walk function
86 """
87 repo = hg.MercurialRepository(root_dir)
80 index_paths_ = set() 88 index_paths_ = set()
81 for path, dirs, files in os.walk(root_dir): 89 for topnode, dirs, files in repo.walk('/', 'tip'):
82 if path.find('.hg') == -1: 90 for f in files:
91 index_paths_.add(jn(root_dir, f.path))
92 for dir in dirs:
83 for f in files: 93 for f in files:
84 index_paths_.add(jn(path, f)) 94 index_paths_.add(jn(root_dir, f.path))
85 95
86 return index_paths_ 96 return index_paths_
87 97
98
88 def add_doc(self, writer, path, repo): 99 def add_doc(self, writer, path, repo):
89 """Adding doc to writer""" 100 """Adding doc to writer"""
90 101 n_path = path[len(repo.path) + 1:]
91 ext = unicode(path.split('/')[-1].split('.')[-1].lower()) 102 node = repo.get_changeset().get_node(n_path)
92 #we just index the content of choosen files 103
93 if ext in INDEX_EXTENSIONS: 104 #we just index the content of chosen files
105 if node.extension in INDEX_EXTENSIONS:
94 log.debug(' >> %s [WITH CONTENT]' % path) 106 log.debug(' >> %s [WITH CONTENT]' % path)
95 fobj = open(path, 'rb') 107 u_content = node.content
96 content = fobj.read()
97 fobj.close()
98 u_content = safe_unicode(content)
99 else: 108 else:
100 log.debug(' >> %s' % path) 109 log.debug(' >> %s' % path)
101 #just index file name without it's content 110 #just index file name without it's content
102 u_content = u'' 111 u_content = u''
103 112
104 113 writer.add_document(owner=unicode(repo.contact),
105 114 repository=safe_unicode(repo.name),
106 try: 115 path=safe_unicode(path),
107 os.stat(path) 116 content=u_content,
108 writer.add_document(owner=unicode(repo.contact), 117 modtime=mktime(node.last_changeset.date.timetuple()),
109 repository=safe_unicode(repo.name), 118 extension=node.extension)
110 path=safe_unicode(path),
111 content=u_content,
112 modtime=os.path.getmtime(path),
113 extension=ext)
114 except OSError, e:
115 import errno
116 if e.errno == errno.ENOENT:
117 log.debug('path %s does not exist or is a broken symlink' % path)
118 else:
119 raise e
120 119
121 120
122 def build_index(self): 121 def build_index(self):
123 if os.path.exists(IDX_LOCATION): 122 if os.path.exists(IDX_LOCATION):
124 log.debug('removing previos index') 123 log.debug('removing previous index')
125 rmtree(IDX_LOCATION) 124 rmtree(IDX_LOCATION)
126 125
127 if not os.path.exists(IDX_LOCATION): 126 if not os.path.exists(IDX_LOCATION):
128 os.mkdir(IDX_LOCATION) 127 os.mkdir(IDX_LOCATION)
129 128