Mercurial > kallithea
comparison pylons_app/lib/indexers/daemon.py @ 406:b153a51b1d3b
Implemented search using whoosh. Still as experimental option.
author | Marcin Kuzminski <marcin@python-works.com> |
---|---|
date | Tue, 17 Aug 2010 23:15:36 +0200 |
parents | |
children | 0c9dfae57107 |
comparison
equal
deleted
inserted
replaced
405:bec06654d67b | 406:b153a51b1d3b |
---|---|
1 #!/usr/bin/env python | |
2 # encoding: utf-8 | |
3 # whoosh indexer daemon for hg-app | |
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com> | |
5 # | |
6 # This program is free software; you can redistribute it and/or | |
7 # modify it under the terms of the GNU General Public License | |
8 # as published by the Free Software Foundation; version 2 | |
9 # of the License or (at your option) any later version of the license. | |
10 # | |
11 # This program is distributed in the hope that it will be useful, | |
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 # GNU General Public License for more details. | |
15 # | |
16 # You should have received a copy of the GNU General Public License | |
17 # along with this program; if not, write to the Free Software | |
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
19 # MA 02110-1301, USA. | |
20 """ | |
21 Created on Jan 26, 2010 | |
22 | |
23 @author: marcink | |
24 A daemon will read from the task table and run tasks | |
25 """ | |
26 import sys | |
27 import os | |
28 from pidlock import LockHeld, DaemonLock | |
29 import traceback | |
30 | |
31 from os.path import dirname as dn | |
32 from os.path import join as jn | |
33 | |
34 #to get the pylons_app import | |
35 sys.path.append(dn(dn(dn(os.path.realpath(__file__))))) | |
36 | |
37 from pylons_app.config.environment import load_environment | |
38 from pylons_app.model.hg_model import HgModel | |
39 from whoosh.index import create_in, open_dir | |
40 from shutil import rmtree | |
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME | |
42 import logging | |
43 log = logging.getLogger(__name__) | |
44 | |
45 | |
# NOTE(review): hard-coded development path — presumably this should come
# from the application configuration; TODO confirm against the config loader.
location = '/home/marcink/python_workspace_dirty/*'

def scan_paths(root_location):
    """Scan *root_location* for repositories.

    Delegates to HgModel.repo_scan; callers iterate the result with
    ``.values()``, so it returns a mapping of discovered repositories.
    """
    return HgModel.repo_scan('/', root_location, None, True)
50 | |
class WhooshIndexingDaemon(object):
    """Daemon for atomic whoosh indexing jobs.

    Either rebuilds the full-text index from scratch (``build_index``)
    or incrementally synchronises an existing index with the filesystem
    (``update_index``): deleted files are dropped from the index,
    modified files re-indexed and new files added.
    """

    def __init__(self, indexname='HG_INDEX'):
        # Name of the whoosh index opened by update_index().
        self.indexname = indexname

    def get_paths(self, root_dir):
        """Recursively walk *root_dir* and return a set of all file paths,
        excluding anything located inside a ``.hg`` control directory.
        """
        index_paths_ = set()
        for path, dirs, files in os.walk(root_dir):
            # Compare path *components*: the old substring test
            # (path.find('.hg') == -1) also wrongly skipped directories
            # that merely contain '.hg' in their name, e.g. 'my.hgx'.
            if '.hg' not in path.split(os.sep):
                for f in files:
                    index_paths_.add(jn(path, f))

        return index_paths_

    def add_doc(self, writer, path, repo):
        """Add a single document for file *path* of *repo* to *writer*.

        Files whose extension is listed in EXCLUDE_EXTENSIONS are indexed
        by path only (empty content); all other files are read and their
        content stored.
        """
        if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
            # Make sure the handle is closed even if read() raises.
            fobj = open(path, 'rb')
            try:
                content = fobj.read()
            finally:
                fobj.close()
            try:
                u_content = unicode(content)
            except UnicodeDecodeError:
                # in case of a decode error represent the content as an
                # escaped byte string so it can still be indexed
                u_content = unicode(str(content).encode('string_escape'))
        else:
            # we don't want to read excluded file extensions,
            # just index their path
            u_content = u''
        writer.add_document(owner=unicode(repo.contact),
                            repository=u"%s" % repo.name,
                            path=u"%s" % path,
                            content=u_content,
                            modtime=os.path.getmtime(path))

    def build_index(self):
        """Wipe any existing index and rebuild it from scratch."""
        if os.path.exists(IDX_LOCATION):
            # start clean: drop the previous index entirely
            rmtree(IDX_LOCATION)
        # rmtree above guarantees the location no longer exists here,
        # so the old "if not exists" re-check was always true.
        os.mkdir(IDX_LOCATION)

        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for repo in scan_paths(location).values():
            log.debug('building index @ %s' % repo.path)

            for idx_path in self.get_paths(repo.path):
                log.debug('  >> %s' % idx_path)
                self.add_doc(writer, idx_path, repo)
        writer.commit(merge=True)

        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """Incrementally synchronise the index with the filesystem."""
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                log.debug('removing from index %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)
            else:
                # Check if this file was changed since it was indexed
                indexed_time = fields['modtime']
                mtime = os.path.getmtime(indexed_path)

                if mtime > indexed_time:
                    # The file has changed: delete the stale document and
                    # remember the path so it gets re-added below.
                    log.debug('adding to reindex list %s' % indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

        # Loop over the files in the filesystem and (re)index anything
        # that changed or was never indexed before.
        for repo in scan_paths(location).values():
            for path in self.get_paths(repo.path):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('reindexing %s' % path)

        writer.commit(merge=True)
        log.debug('>>> FINISHED <<<')

    def run(self, full_index=False):
        """Run the daemon: full rebuild if *full_index*, else incremental."""
        if full_index:
            self.build_index()
        else:
            self.update_index()
170 | |
if __name__ == "__main__":
    # Acquire the daemon lock first; bail out if another indexer runs.
    try:
        l = DaemonLock()
    except LockHeld:
        sys.exit(1)
    # Release the lock even when indexing fails — the original left the
    # lock held on any exception raised by run().
    try:
        WhooshIndexingDaemon().run(full_index=True)
    finally:
        l.release()
181 |