Mercurial > kallithea
comparison rhodecode/lib/indexers/daemon.py @ 560:3072935bdeed
Rewrote whoosh indexing to use the internal repository.walk() instead of walking the filesystem.
Disabled the default hg update hook (no longer needed, since whoosh no longer depends on files on the file system to build its index).
author | Marcin Kuzminski <marcin@python-works.com> |
---|---|
date | Sat, 09 Oct 2010 00:22:19 +0200 |
parents | 29ec9ddbe258 |
children | 5f3b967d9d10 |
comparison
equal
deleted
inserted
replaced
559:bc4633a41967 | 560:3072935bdeed |
---|---|
37 from rhodecode.lib.helpers import safe_unicode | 37 from rhodecode.lib.helpers import safe_unicode |
38 from whoosh.index import create_in, open_dir | 38 from whoosh.index import create_in, open_dir |
39 from shutil import rmtree | 39 from shutil import rmtree |
40 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME | 40 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME |
41 | 41 |
42 from time import mktime | |
43 from vcs.backends import hg | |
44 | |
42 import logging | 45 import logging |
43 | 46 |
44 log = logging.getLogger('whooshIndexer') | 47 log = logging.getLogger('whooshIndexer') |
45 # create logger | 48 # create logger |
46 log.setLevel(logging.DEBUG) | 49 log.setLevel(logging.DEBUG) |
60 | 63 |
61 def scan_paths(root_location): | 64 def scan_paths(root_location): |
62 return HgModel.repo_scan('/', root_location, None, True) | 65 return HgModel.repo_scan('/', root_location, None, True) |
63 | 66 |
64 class WhooshIndexingDaemon(object): | 67 class WhooshIndexingDaemon(object): |
65 """Deamon for atomic jobs""" | 68 """ |
69 Deamon for atomic jobs | |
70 """ | |
66 | 71 |
67 def __init__(self, indexname='HG_INDEX', repo_location=None): | 72 def __init__(self, indexname='HG_INDEX', repo_location=None): |
68 self.indexname = indexname | 73 self.indexname = indexname |
69 self.repo_location = repo_location | 74 self.repo_location = repo_location |
70 self.initial = False | 75 self.initial = False |
71 if not os.path.isdir(IDX_LOCATION): | 76 if not os.path.isdir(IDX_LOCATION): |
72 os.mkdir(IDX_LOCATION) | 77 os.mkdir(IDX_LOCATION) |
73 log.info('Cannot run incremental index since it does not' | 78 log.info('Cannot run incremental index since it does not' |
74 ' yet exist running full build') | 79 ' yet exist running full build') |
75 self.initial = True | 80 self.initial = True |
76 | 81 |
77 def get_paths(self, root_dir): | 82 def get_paths(self, root_dir): |
78 """recursive walk in root dir and return a set of all path in that dir | 83 """ |
79 excluding files in .hg dir""" | 84 recursive walk in root dir and return a set of all path in that dir |
85 based on repository walk function | |
86 """ | |
87 repo = hg.MercurialRepository(root_dir) | |
80 index_paths_ = set() | 88 index_paths_ = set() |
81 for path, dirs, files in os.walk(root_dir): | 89 for topnode, dirs, files in repo.walk('/', 'tip'): |
82 if path.find('.hg') == -1: | 90 for f in files: |
91 index_paths_.add(jn(root_dir, f.path)) | |
92 for dir in dirs: | |
83 for f in files: | 93 for f in files: |
84 index_paths_.add(jn(path, f)) | 94 index_paths_.add(jn(root_dir, f.path)) |
85 | 95 |
86 return index_paths_ | 96 return index_paths_ |
87 | 97 |
98 | |
88 def add_doc(self, writer, path, repo): | 99 def add_doc(self, writer, path, repo): |
89 """Adding doc to writer""" | 100 """Adding doc to writer""" |
90 | 101 n_path = path[len(repo.path) + 1:] |
91 ext = unicode(path.split('/')[-1].split('.')[-1].lower()) | 102 node = repo.get_changeset().get_node(n_path) |
92 #we just index the content of choosen files | 103 |
93 if ext in INDEX_EXTENSIONS: | 104 #we just index the content of chosen files |
105 if node.extension in INDEX_EXTENSIONS: | |
94 log.debug(' >> %s [WITH CONTENT]' % path) | 106 log.debug(' >> %s [WITH CONTENT]' % path) |
95 fobj = open(path, 'rb') | 107 u_content = node.content |
96 content = fobj.read() | |
97 fobj.close() | |
98 u_content = safe_unicode(content) | |
99 else: | 108 else: |
100 log.debug(' >> %s' % path) | 109 log.debug(' >> %s' % path) |
101 #just index file name without it's content | 110 #just index file name without it's content |
102 u_content = u'' | 111 u_content = u'' |
103 | 112 |
104 | 113 writer.add_document(owner=unicode(repo.contact), |
105 | 114 repository=safe_unicode(repo.name), |
106 try: | 115 path=safe_unicode(path), |
107 os.stat(path) | 116 content=u_content, |
108 writer.add_document(owner=unicode(repo.contact), | 117 modtime=mktime(node.last_changeset.date.timetuple()), |
109 repository=safe_unicode(repo.name), | 118 extension=node.extension) |
110 path=safe_unicode(path), | |
111 content=u_content, | |
112 modtime=os.path.getmtime(path), | |
113 extension=ext) | |
114 except OSError, e: | |
115 import errno | |
116 if e.errno == errno.ENOENT: | |
117 log.debug('path %s does not exist or is a broken symlink' % path) | |
118 else: | |
119 raise e | |
120 | 119 |
121 | 120 |
122 def build_index(self): | 121 def build_index(self): |
123 if os.path.exists(IDX_LOCATION): | 122 if os.path.exists(IDX_LOCATION): |
124 log.debug('removing previos index') | 123 log.debug('removing previous index') |
125 rmtree(IDX_LOCATION) | 124 rmtree(IDX_LOCATION) |
126 | 125 |
127 if not os.path.exists(IDX_LOCATION): | 126 if not os.path.exists(IDX_LOCATION): |
128 os.mkdir(IDX_LOCATION) | 127 os.mkdir(IDX_LOCATION) |
129 | 128 |