Mercurial > kallithea
comparison rhodecode/lib/indexers/daemon.py @ 2165:dc2584ba5fbc
merged beta into default branch
author | Marcin Kuzminski <marcin@python-works.com> |
---|---|
date | Wed, 28 Mar 2012 19:54:16 +0200 |
parents | 82a88013a3fd 8ecfed1d8f8b |
children | 63e58ef80ef1 |
comparison (legend for row states: equal, deleted, inserted, replaced)
2097:8fd6650bb436 | 2165:dc2584ba5fbc |
---|---|
36 | 36 |
37 #to get the rhodecode import | 37 #to get the rhodecode import |
38 project_path = dn(dn(dn(dn(os.path.realpath(__file__))))) | 38 project_path = dn(dn(dn(dn(os.path.realpath(__file__))))) |
39 sys.path.append(project_path) | 39 sys.path.append(project_path) |
40 | 40 |
41 | 41 from rhodecode.config.conf import INDEX_EXTENSIONS |
42 from rhodecode.model.scm import ScmModel | 42 from rhodecode.model.scm import ScmModel |
43 from rhodecode.lib import safe_unicode | 43 from rhodecode.lib.utils2 import safe_unicode |
44 from rhodecode.lib.indexers import INDEX_EXTENSIONS, SCHEMA, IDX_NAME | 44 from rhodecode.lib.indexers import SCHEMA, IDX_NAME |
45 | 45 |
46 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \ | 46 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \ |
47 NodeDoesNotExistError | 47 NodeDoesNotExistError |
48 | 48 |
49 from whoosh.index import create_in, open_dir | 49 from whoosh.index import create_in, open_dir |
50 | 50 |
51 | 51 log = logging.getLogger('whoosh_indexer') |
52 log = logging.getLogger('whooshIndexer') | |
53 # create logger | |
54 log.setLevel(logging.DEBUG) | |
55 log.propagate = False | |
56 # create console handler and set level to debug | |
57 ch = logging.StreamHandler() | |
58 ch.setLevel(logging.DEBUG) | |
59 | |
60 # create formatter | |
61 formatter = logging.Formatter("%(asctime)s - %(name)s -" | |
62 " %(levelname)s - %(message)s") | |
63 | |
64 # add formatter to ch | |
65 ch.setFormatter(formatter) | |
66 | |
67 # add ch to logger | |
68 log.addHandler(ch) | |
69 | 52 |
70 | 53 |
71 class WhooshIndexingDaemon(object): | 54 class WhooshIndexingDaemon(object): |
72 """ | 55 """ |
73 Daemon for atomic jobs | 56 Daemon for atomic jobs |
101 log.info('Cannot run incremental index since it does not' | 84 log.info('Cannot run incremental index since it does not' |
102 ' yet exist running full build') | 85 ' yet exist running full build') |
103 self.initial = True | 86 self.initial = True |
104 | 87 |
105 def get_paths(self, repo): | 88 def get_paths(self, repo): |
106 """recursive walk in root dir and return a set of all path in that dir | 89 """ |
90 recursive walk in root dir and return a set of all path in that dir | |
107 based on repository walk function | 91 based on repository walk function |
108 """ | 92 """ |
109 index_paths_ = set() | 93 index_paths_ = set() |
110 try: | 94 try: |
111 tip = repo.get_changeset('tip') | 95 tip = repo.get_changeset('tip') |
125 | 109 |
126 def get_node_mtime(self, node): | 110 def get_node_mtime(self, node): |
127 return mktime(node.last_changeset.date.timetuple()) | 111 return mktime(node.last_changeset.date.timetuple()) |
128 | 112 |
129 def add_doc(self, writer, path, repo, repo_name): | 113 def add_doc(self, writer, path, repo, repo_name): |
130 """Adding doc to writer this function itself fetches data from | 114 """ |
131 the instance of vcs backend""" | 115 Adding doc to writer this function itself fetches data from |
116 the instance of vcs backend | |
117 """ | |
118 | |
132 node = self.get_node(repo, path) | 119 node = self.get_node(repo, path) |
133 | 120 indexed = indexed_w_content = 0 |
134 #we just index the content of chosen files, and skip binary files | 121 # we just index the content of chosen files, and skip binary files |
135 if node.extension in INDEX_EXTENSIONS and not node.is_binary: | 122 if node.extension in INDEX_EXTENSIONS and not node.is_binary: |
136 | |
137 u_content = node.content | 123 u_content = node.content |
138 if not isinstance(u_content, unicode): | 124 if not isinstance(u_content, unicode): |
139 log.warning(' >> %s Could not get this content as unicode ' | 125 log.warning(' >> %s Could not get this content as unicode ' |
140 'replacing with empty content', path) | 126 'replacing with empty content' % path) |
141 u_content = u'' | 127 u_content = u'' |
142 else: | 128 else: |
143 log.debug(' >> %s [WITH CONTENT]' % path) | 129 log.debug(' >> %s [WITH CONTENT]' % path) |
130 indexed_w_content += 1 | |
144 | 131 |
145 else: | 132 else: |
146 log.debug(' >> %s' % path) | 133 log.debug(' >> %s' % path) |
147 #just index file name without it's content | 134 # just index file name without it's content |
148 u_content = u'' | 135 u_content = u'' |
149 | 136 indexed += 1 |
150 writer.add_document(owner=unicode(repo.contact), | 137 |
151 repository=safe_unicode(repo_name), | 138 writer.add_document( |
152 path=safe_unicode(path), | 139 owner=unicode(repo.contact), |
153 content=u_content, | 140 repository=safe_unicode(repo_name), |
154 modtime=self.get_node_mtime(node), | 141 path=safe_unicode(path), |
155 extension=node.extension) | 142 content=u_content, |
143 modtime=self.get_node_mtime(node), | |
144 extension=node.extension | |
145 ) | |
146 return indexed, indexed_w_content | |
156 | 147 |
157 def build_index(self): | 148 def build_index(self): |
158 if os.path.exists(self.index_location): | 149 if os.path.exists(self.index_location): |
159 log.debug('removing previous index') | 150 log.debug('removing previous index') |
160 rmtree(self.index_location) | 151 rmtree(self.index_location) |
162 if not os.path.exists(self.index_location): | 153 if not os.path.exists(self.index_location): |
163 os.mkdir(self.index_location) | 154 os.mkdir(self.index_location) |
164 | 155 |
165 idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME) | 156 idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME) |
166 writer = idx.writer() | 157 writer = idx.writer() |
167 | 158 log.debug('BUILDIN INDEX FOR EXTENSIONS %s' % INDEX_EXTENSIONS) |
168 for repo_name, repo in self.repo_paths.items(): | 159 for repo_name, repo in self.repo_paths.items(): |
169 log.debug('building index @ %s' % repo.path) | 160 log.debug('building index @ %s' % repo.path) |
170 | 161 i_cnt = iwc_cnt = 0 |
171 for idx_path in self.get_paths(repo): | 162 for idx_path in self.get_paths(repo): |
172 self.add_doc(writer, idx_path, repo, repo_name) | 163 i, iwc = self.add_doc(writer, idx_path, repo, repo_name) |
164 i_cnt += i | |
165 iwc_cnt += iwc | |
166 log.debug('added %s files %s with content for repo %s' % ( | |
167 i_cnt + iwc_cnt, iwc_cnt, repo.path) | |
168 ) | |
173 | 169 |
174 log.debug('>> COMMITING CHANGES <<') | 170 log.debug('>> COMMITING CHANGES <<') |
175 writer.commit(merge=True) | 171 writer.commit(merge=True) |
176 log.debug('>>> FINISHED BUILDING INDEX <<<') | 172 log.debug('>>> FINISHED BUILDING INDEX <<<') |
177 | 173 |
178 def update_index(self): | 174 def update_index(self): |
179 log.debug('STARTING INCREMENTAL INDEXING UPDATE') | 175 log.debug('STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s' % |
176 INDEX_EXTENSIONS) | |
180 | 177 |
181 idx = open_dir(self.index_location, indexname=self.indexname) | 178 idx = open_dir(self.index_location, indexname=self.indexname) |
182 # The set of all paths in the index | 179 # The set of all paths in the index |
183 indexed_paths = set() | 180 indexed_paths = set() |
184 # The set of all paths we need to re-index | 181 # The set of all paths we need to re-index |
213 to_index.add(indexed_path) | 210 to_index.add(indexed_path) |
214 | 211 |
215 # Loop over the files in the filesystem | 212 # Loop over the files in the filesystem |
216 # Assume we have a function that gathers the filenames of the | 213 # Assume we have a function that gathers the filenames of the |
217 # documents to be indexed | 214 # documents to be indexed |
215 ri_cnt = riwc_cnt = 0 | |
218 for repo_name, repo in self.repo_paths.items(): | 216 for repo_name, repo in self.repo_paths.items(): |
219 for path in self.get_paths(repo): | 217 for path in self.get_paths(repo): |
220 if path in to_index or path not in indexed_paths: | 218 if path in to_index or path not in indexed_paths: |
221 # This is either a file that's changed, or a new file | 219 # This is either a file that's changed, or a new file |
222 # that wasn't indexed before. So index it! | 220 # that wasn't indexed before. So index it! |
223 self.add_doc(writer, path, repo, repo_name) | 221 i, iwc = self.add_doc(writer, path, repo, repo_name) |
224 log.debug('re indexing %s' % path) | 222 log.debug('re indexing %s' % path) |
225 | 223 ri_cnt += i |
224 riwc_cnt += iwc | |
225 log.debug('added %s files %s with content for repo %s' % ( | |
226 ri_cnt + riwc_cnt, riwc_cnt, repo.path) | |
227 ) | |
226 log.debug('>> COMMITING CHANGES <<') | 228 log.debug('>> COMMITING CHANGES <<') |
227 writer.commit(merge=True) | 229 writer.commit(merge=True) |
228 log.debug('>>> FINISHED REBUILDING INDEX <<<') | 230 log.debug('>>> FINISHED REBUILDING INDEX <<<') |
229 | 231 |
230 def run(self, full_index=False): | 232 def run(self, full_index=False): |