| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | '''
 | 
					
						
							|  |  |  | searx is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  | it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  | the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  | (at your option) any later version. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | searx is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  | GNU Affero General Public License for more details. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  | along with searx. If not, see < http://www.gnu.org/licenses/ >. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (C) 2013- by Adam Tauber, <asciimoo@gmail.com> | 
					
						
							|  |  |  | '''
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-24 16:30:04 +02:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | from lxml import etree | 
					
						
							|  |  |  | from os import listdir | 
					
						
							| 
									
										
										
										
											2014-10-19 21:39:30 +02:00
										 |  |  | from os.path import isfile, isdir, join | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-24 16:30:04 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # https://gitweb.torproject.org/\ | 
					
						
							|  |  |  | # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules | 
					
						
							|  |  |  | 
 | 
					
						
# HTTPS rewrite rules
# Module-level registry filled by load_https_rules(): one
# (target_hosts, rules, exclusions) tuple is appended for every ruleset
# file that load_single_https_ruleset() returns a non-empty result for.
https_rules = []
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # load single ruleset from a xml file | 
					
						
							|  |  |  | def load_single_https_ruleset(filepath): | 
					
						
							|  |  |  |     ruleset = () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # init parser | 
					
						
							|  |  |  |     parser = etree.XMLParser() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # load and parse xml-file | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         tree = etree.parse(filepath, parser) | 
					
						
							|  |  |  |     except: | 
					
						
							|  |  |  |         # TODO, error message | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # get root node | 
					
						
							|  |  |  |     root = tree.getroot() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # check if root is a node with the name ruleset | 
					
						
							|  |  |  |     # TODO improve parsing | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |     if root.tag != 'ruleset': | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # check if rule is deactivated by default | 
					
						
							|  |  |  |     if root.attrib.get('default_off'): | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # check if rule does only work for specific platforms | 
					
						
							|  |  |  |     if root.attrib.get('platform'): | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     hosts = [] | 
					
						
							|  |  |  |     rules = [] | 
					
						
							|  |  |  |     exclusions = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # parse childs from ruleset | 
					
						
							|  |  |  |     for ruleset in root: | 
					
						
							|  |  |  |         # this child define a target | 
					
						
							|  |  |  |         if ruleset.tag == 'target': | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # check if required tags available | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |             if not ruleset.attrib.get('host'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # convert host-rule to valid regex | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             host = ruleset.attrib.get('host')\ | 
					
						
							|  |  |  |                 .replace('.', '\.').replace('*', '.*') | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             # append to host list | 
					
						
							|  |  |  |             hosts.append(host) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # this child define a rule | 
					
						
							|  |  |  |         elif ruleset.tag == 'rule': | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # check if required tags available | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |             if not ruleset.attrib.get('from')\ | 
					
						
							|  |  |  |                or not ruleset.attrib.get('to'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # TODO hack, which convert a javascript regex group | 
					
						
							|  |  |  |             # into a valid python regex group | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |             rule_from = ruleset.attrib.get('from').replace('$', '\\') | 
					
						
							|  |  |  |             rule_to = ruleset.attrib.get('to').replace('$', '\\') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # TODO, not working yet because of the hack above, | 
					
						
							|  |  |  |             # currently doing that in webapp.py | 
					
						
							|  |  |  |             # rule_from_rgx = re.compile(rule_from, re.I) | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             # append rule | 
					
						
							|  |  |  |             rules.append((rule_from, rule_to)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # this child define an exclusion | 
					
						
							|  |  |  |         elif ruleset.tag == 'exclusion': | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # check if required tags available | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |             if not ruleset.attrib.get('pattern'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             exclusion_rgx = re.compile(ruleset.attrib.get('pattern')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # append exclusion | 
					
						
							|  |  |  |             exclusions.append(exclusion_rgx) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # convert list of possible hosts to a simple regex | 
					
						
							|  |  |  |     # TODO compress regex to improve performance | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U) | 
					
						
							|  |  |  |     except: | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # return ruleset | 
					
						
							|  |  |  |     return (target_hosts, rules, exclusions) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # load all https rewrite rules | 
					
						
							|  |  |  | def load_https_rules(rules_path): | 
					
						
							| 
									
										
										
										
											2014-10-19 21:39:30 +02:00
										 |  |  |     # check if directory exists | 
					
						
							|  |  |  |     if not isdir(rules_path): | 
					
						
							|  |  |  |         print("[E] directory not found: '" + rules_path + "'") | 
					
						
							|  |  |  |         return | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # search all xml files which are stored in the https rule directory | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |     xml_files = [join(rules_path, f) | 
					
						
							|  |  |  |                  for f in listdir(rules_path) | 
					
						
							|  |  |  |                  if isfile(join(rules_path, f)) and f[-4:] == '.xml'] | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # load xml-files | 
					
						
							|  |  |  |     for ruleset_file in xml_files: | 
					
						
							|  |  |  |         # calculate rewrite-rules | 
					
						
							|  |  |  |         ruleset = load_single_https_ruleset(ruleset_file) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # skip if no ruleset returned | 
					
						
							|  |  |  |         if not ruleset: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # append ruleset | 
					
						
							|  |  |  |         https_rules.append(ruleset) | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-15 14:47:03 +02:00
										 |  |  |     print(' * {n} https-rules loaded'.format(n=len(https_rules))) |