comparison rhodecode/lib/indexers/daemon.py @ 2165:dc2584ba5fbc

merged beta into default branch
author Marcin Kuzminski <marcin@python-works.com>
date Wed, 28 Mar 2012 19:54:16 +0200
parents 82a88013a3fd 8ecfed1d8f8b
children 63e58ef80ef1
comparison
equal deleted inserted replaced
2097:8fd6650bb436 2165:dc2584ba5fbc
36 36
37 #to get the rhodecode import 37 #to get the rhodecode import
38 project_path = dn(dn(dn(dn(os.path.realpath(__file__))))) 38 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
39 sys.path.append(project_path) 39 sys.path.append(project_path)
40 40
41 41 from rhodecode.config.conf import INDEX_EXTENSIONS
42 from rhodecode.model.scm import ScmModel 42 from rhodecode.model.scm import ScmModel
43 from rhodecode.lib import safe_unicode 43 from rhodecode.lib.utils2 import safe_unicode
44 from rhodecode.lib.indexers import INDEX_EXTENSIONS, SCHEMA, IDX_NAME 44 from rhodecode.lib.indexers import SCHEMA, IDX_NAME
45 45
46 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \ 46 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
47 NodeDoesNotExistError 47 NodeDoesNotExistError
48 48
49 from whoosh.index import create_in, open_dir 49 from whoosh.index import create_in, open_dir
50 50
51 51 log = logging.getLogger('whoosh_indexer')
52 log = logging.getLogger('whooshIndexer')
53 # create logger
54 log.setLevel(logging.DEBUG)
55 log.propagate = False
56 # create console handler and set level to debug
57 ch = logging.StreamHandler()
58 ch.setLevel(logging.DEBUG)
59
60 # create formatter
61 formatter = logging.Formatter("%(asctime)s - %(name)s -"
62 " %(levelname)s - %(message)s")
63
64 # add formatter to ch
65 ch.setFormatter(formatter)
66
67 # add ch to logger
68 log.addHandler(ch)
69 52
70 53
71 class WhooshIndexingDaemon(object): 54 class WhooshIndexingDaemon(object):
72 """ 55 """
73 Daemon for atomic jobs 56 Daemon for atomic jobs
101 log.info('Cannot run incremental index since it does not' 84 log.info('Cannot run incremental index since it does not'
102 ' yet exist running full build') 85 ' yet exist running full build')
103 self.initial = True 86 self.initial = True
104 87
105 def get_paths(self, repo): 88 def get_paths(self, repo):
106 """recursive walk in root dir and return a set of all path in that dir 89 """
90 recursive walk in root dir and return a set of all path in that dir
107 based on repository walk function 91 based on repository walk function
108 """ 92 """
109 index_paths_ = set() 93 index_paths_ = set()
110 try: 94 try:
111 tip = repo.get_changeset('tip') 95 tip = repo.get_changeset('tip')
125 109
126 def get_node_mtime(self, node): 110 def get_node_mtime(self, node):
127 return mktime(node.last_changeset.date.timetuple()) 111 return mktime(node.last_changeset.date.timetuple())
128 112
129 def add_doc(self, writer, path, repo, repo_name): 113 def add_doc(self, writer, path, repo, repo_name):
130 """Adding doc to writer this function itself fetches data from 114 """
131 the instance of vcs backend""" 115 Adding doc to writer this function itself fetches data from
116 the instance of vcs backend
117 """
118
132 node = self.get_node(repo, path) 119 node = self.get_node(repo, path)
133 120 indexed = indexed_w_content = 0
134 #we just index the content of chosen files, and skip binary files 121 # we just index the content of chosen files, and skip binary files
135 if node.extension in INDEX_EXTENSIONS and not node.is_binary: 122 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
136
137 u_content = node.content 123 u_content = node.content
138 if not isinstance(u_content, unicode): 124 if not isinstance(u_content, unicode):
139 log.warning(' >> %s Could not get this content as unicode ' 125 log.warning(' >> %s Could not get this content as unicode '
140 'replacing with empty content', path) 126 'replacing with empty content' % path)
141 u_content = u'' 127 u_content = u''
142 else: 128 else:
143 log.debug(' >> %s [WITH CONTENT]' % path) 129 log.debug(' >> %s [WITH CONTENT]' % path)
130 indexed_w_content += 1
144 131
145 else: 132 else:
146 log.debug(' >> %s' % path) 133 log.debug(' >> %s' % path)
147 #just index file name without it's content 134 # just index file name without it's content
148 u_content = u'' 135 u_content = u''
149 136 indexed += 1
150 writer.add_document(owner=unicode(repo.contact), 137
151 repository=safe_unicode(repo_name), 138 writer.add_document(
152 path=safe_unicode(path), 139 owner=unicode(repo.contact),
153 content=u_content, 140 repository=safe_unicode(repo_name),
154 modtime=self.get_node_mtime(node), 141 path=safe_unicode(path),
155 extension=node.extension) 142 content=u_content,
143 modtime=self.get_node_mtime(node),
144 extension=node.extension
145 )
146 return indexed, indexed_w_content
156 147
157 def build_index(self): 148 def build_index(self):
158 if os.path.exists(self.index_location): 149 if os.path.exists(self.index_location):
159 log.debug('removing previous index') 150 log.debug('removing previous index')
160 rmtree(self.index_location) 151 rmtree(self.index_location)
162 if not os.path.exists(self.index_location): 153 if not os.path.exists(self.index_location):
163 os.mkdir(self.index_location) 154 os.mkdir(self.index_location)
164 155
165 idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME) 156 idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
166 writer = idx.writer() 157 writer = idx.writer()
167 158 log.debug('BUILDIN INDEX FOR EXTENSIONS %s' % INDEX_EXTENSIONS)
168 for repo_name, repo in self.repo_paths.items(): 159 for repo_name, repo in self.repo_paths.items():
169 log.debug('building index @ %s' % repo.path) 160 log.debug('building index @ %s' % repo.path)
170 161 i_cnt = iwc_cnt = 0
171 for idx_path in self.get_paths(repo): 162 for idx_path in self.get_paths(repo):
172 self.add_doc(writer, idx_path, repo, repo_name) 163 i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
164 i_cnt += i
165 iwc_cnt += iwc
166 log.debug('added %s files %s with content for repo %s' % (
167 i_cnt + iwc_cnt, iwc_cnt, repo.path)
168 )
173 169
174 log.debug('>> COMMITING CHANGES <<') 170 log.debug('>> COMMITING CHANGES <<')
175 writer.commit(merge=True) 171 writer.commit(merge=True)
176 log.debug('>>> FINISHED BUILDING INDEX <<<') 172 log.debug('>>> FINISHED BUILDING INDEX <<<')
177 173
178 def update_index(self): 174 def update_index(self):
179 log.debug('STARTING INCREMENTAL INDEXING UPDATE') 175 log.debug('STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s' %
176 INDEX_EXTENSIONS)
180 177
181 idx = open_dir(self.index_location, indexname=self.indexname) 178 idx = open_dir(self.index_location, indexname=self.indexname)
182 # The set of all paths in the index 179 # The set of all paths in the index
183 indexed_paths = set() 180 indexed_paths = set()
184 # The set of all paths we need to re-index 181 # The set of all paths we need to re-index
213 to_index.add(indexed_path) 210 to_index.add(indexed_path)
214 211
215 # Loop over the files in the filesystem 212 # Loop over the files in the filesystem
216 # Assume we have a function that gathers the filenames of the 213 # Assume we have a function that gathers the filenames of the
217 # documents to be indexed 214 # documents to be indexed
215 ri_cnt = riwc_cnt = 0
218 for repo_name, repo in self.repo_paths.items(): 216 for repo_name, repo in self.repo_paths.items():
219 for path in self.get_paths(repo): 217 for path in self.get_paths(repo):
220 if path in to_index or path not in indexed_paths: 218 if path in to_index or path not in indexed_paths:
221 # This is either a file that's changed, or a new file 219 # This is either a file that's changed, or a new file
222 # that wasn't indexed before. So index it! 220 # that wasn't indexed before. So index it!
223 self.add_doc(writer, path, repo, repo_name) 221 i, iwc = self.add_doc(writer, path, repo, repo_name)
224 log.debug('re indexing %s' % path) 222 log.debug('re indexing %s' % path)
225 223 ri_cnt += i
224 riwc_cnt += iwc
225 log.debug('added %s files %s with content for repo %s' % (
226 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
227 )
226 log.debug('>> COMMITING CHANGES <<') 228 log.debug('>> COMMITING CHANGES <<')
227 writer.commit(merge=True) 229 writer.commit(merge=True)
228 log.debug('>>> FINISHED REBUILDING INDEX <<<') 230 log.debug('>>> FINISHED REBUILDING INDEX <<<')
229 231
230 def run(self, full_index=False): 232 def run(self, full_index=False):