| 
									
										
										
										
											2013-11-18 16:47:20 +01:00
import cStringIO
import csv
import htmlentitydefs
import os
import re
from codecs import getincrementalencoder
from HTMLParser import HTMLParser
from random import choice
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-03-04 14:20:37 +01:00
										 |  |  | ua_versions = ('26.0', '27.0', '28.0') | 
					
						
							| 
									
										
										
										
											2014-05-20 16:55:49 +02:00
										 |  |  | ua_os = ('Windows NT 6.3; WOW64', | 
					
						
							|  |  |  |          'X11; Linux x86_64', | 
					
						
							|  |  |  |          'X11; Linux x86') | 
					
						
							| 
									
										
										
										
											2014-03-04 14:20:37 +01:00
										 |  |  | ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}" | 
					
						
							| 
									
										
										
										
											2014-01-19 22:59:01 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-03-04 19:26:09 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-18 21:53:59 +01:00
										 |  |  | def gen_useragent(): | 
					
						
							| 
									
										
										
										
											2014-01-12 20:13:14 +01:00
										 |  |  |     # TODO | 
					
						
							| 
									
										
										
										
											2014-03-04 14:20:37 +01:00
										 |  |  |     return ua.format(os=choice(ua_os), version=choice(ua_versions)) | 
					
						
							| 
									
										
										
										
											2014-01-12 20:13:14 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-19 22:59:01 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-10 23:38:08 +01:00
										 |  |  | def highlight_content(content, query): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if not content: | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  |     # ignoring html contents | 
					
						
							|  |  |  |     # TODO better html content detection | 
					
						
							|  |  |  |     if content.find('<') != -1: | 
					
						
							|  |  |  |         return content | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     query = query.decode('utf-8') | 
					
						
							|  |  |  |     if content.lower().find(query.lower()) > -1: | 
					
						
							|  |  |  |         query_regex = u'({0})'.format(re.escape(query)) | 
					
						
							| 
									
										
										
										
											2014-05-16 16:51:23 +02:00
										 |  |  |         content = re.sub(query_regex, '<span class="highlight">\\1</span>', | 
					
						
							|  |  |  |                          content, flags=re.I | re.U) | 
					
						
							| 
									
										
										
										
											2014-01-10 23:38:08 +01:00
										 |  |  |     else: | 
					
						
							|  |  |  |         regex_parts = [] | 
					
						
							|  |  |  |         for chunk in query.split(): | 
					
						
							|  |  |  |             if len(chunk) == 1: | 
					
						
							|  |  |  |                 regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk))) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 regex_parts.append(u'{0}'.format(re.escape(chunk))) | 
					
						
							|  |  |  |         query_regex = u'({0})'.format('|'.join(regex_parts)) | 
					
						
							| 
									
										
										
										
											2014-05-16 16:51:23 +02:00
										 |  |  |         content = re.sub(query_regex, '<span class="highlight">\\1</span>', | 
					
						
							|  |  |  |                          content, flags=re.I | re.U) | 
					
						
							| 
									
										
										
										
											2014-01-10 23:38:08 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return content | 
					
						
							| 
									
										
										
										
											2013-11-08 23:44:26 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-19 22:59:01 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-08 23:44:26 +01:00
										 |  |  | class HTMLTextExtractor(HTMLParser): | 
					
						
							|  |  |  |     def __init__(self): | 
					
						
							|  |  |  |         HTMLParser.__init__(self) | 
					
						
							| 
									
										
										
										
											2014-01-19 22:59:01 +01:00
										 |  |  |         self.result = [] | 
					
						
							| 
									
										
										
										
											2013-11-08 23:44:26 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def handle_data(self, d): | 
					
						
							|  |  |  |         self.result.append(d) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def handle_charref(self, number): | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |         if number[0] in (u'x', u'X'): | 
					
						
							|  |  |  |             codepoint = int(number[1:], 16) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             codepoint = int(number) | 
					
						
							| 
									
										
										
										
											2013-11-08 23:44:26 +01:00
										 |  |  |         self.result.append(unichr(codepoint)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def handle_entityref(self, name): | 
					
						
							| 
									
										
										
										
											2013-11-18 16:47:20 +01:00
										 |  |  |         #codepoint = htmlentitydefs.name2codepoint[name] | 
					
						
							|  |  |  |         #self.result.append(unichr(codepoint)) | 
					
						
							|  |  |  |         self.result.append(name) | 
					
						
							| 
									
										
										
										
											2013-11-08 23:44:26 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def get_text(self): | 
					
						
							|  |  |  |         return u''.join(self.result) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-19 22:59:01 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-08 23:44:26 +01:00
										 |  |  | def html_to_text(html): | 
					
						
							|  |  |  |     s = HTMLTextExtractor() | 
					
						
							|  |  |  |     s.feed(html) | 
					
						
							|  |  |  |     return s.get_text() | 
					
						
							| 
									
										
										
										
											2013-11-15 18:55:18 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class UnicodeWriter: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     A CSV writer which will write rows to CSV file "f", | 
					
						
							|  |  |  |     which is encoded in the given encoding. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): | 
					
						
							|  |  |  |         # Redirect output to a queue | 
					
						
							|  |  |  |         self.queue = cStringIO.StringIO() | 
					
						
							|  |  |  |         self.writer = csv.writer(self.queue, dialect=dialect, **kwds) | 
					
						
							|  |  |  |         self.stream = f | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |         self.encoder = getincrementalencoder(encoding)() | 
					
						
							| 
									
										
										
										
											2013-11-15 18:55:18 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def writerow(self, row): | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |         unicode_row = [] | 
					
						
							|  |  |  |         for col in row: | 
					
						
							|  |  |  |             if type(col) == str or type(col) == unicode: | 
					
						
							|  |  |  |                 unicode_row.append(col.encode('utf-8').strip()) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 unicode_row.append(col) | 
					
						
							|  |  |  |         self.writer.writerow(unicode_row) | 
					
						
							| 
									
										
										
										
											2013-11-15 18:55:18 +01:00
										 |  |  |         # Fetch UTF-8 output from the queue ... | 
					
						
							|  |  |  |         data = self.queue.getvalue() | 
					
						
							|  |  |  |         data = data.decode("utf-8") | 
					
						
							|  |  |  |         # ... and reencode it into the target encoding | 
					
						
							|  |  |  |         data = self.encoder.encode(data) | 
					
						
							|  |  |  |         # write to the target stream | 
					
						
							|  |  |  |         self.stream.write(data) | 
					
						
							|  |  |  |         # empty queue | 
					
						
							|  |  |  |         self.queue.truncate(0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def writerows(self, rows): | 
					
						
							|  |  |  |         for row in rows: | 
					
						
							|  |  |  |             self.writerow(row) | 
					
						
							| 
									
										
										
										
											2014-04-25 01:46:40 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_themes(root): | 
					
						
							|  |  |  |     """Returns available themes list.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     static_path = os.path.join(root, 'static') | 
					
						
							|  |  |  |     static_names = set(os.listdir(static_path)) | 
					
						
							|  |  |  |     templates_path = os.path.join(root, 'templates') | 
					
						
							|  |  |  |     templates_names = set(os.listdir(templates_path)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     themes = [] | 
					
						
							|  |  |  |     for name in static_names.intersection(templates_names): | 
					
						
							|  |  |  |         themes += [name] | 
					
						
							|  |  |  |     return static_path, templates_path, themes |