| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | '''
 | 
					
						
							|  |  |  | searx is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  | it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  | the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  | (at your option) any later version. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | searx is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  | GNU Affero General Public License for more details. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  | along with searx. If not, see < http://www.gnu.org/licenses/ >. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (C) 2013- by Adam Tauber, <asciimoo@gmail.com> | 
					
						
							|  |  |  | '''
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-24 16:30:04 +02:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2014-12-19 22:40:37 +01:00
										 |  |  | from urlparse import urlparse | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | from lxml import etree | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  | from os import listdir, environ | 
					
						
							| 
									
										
										
										
											2014-10-19 21:39:30 +02:00
										 |  |  | from os.path import isfile, isdir, join | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  | from searx.plugins import logger | 
					
						
							| 
									
										
										
										
											2016-07-04 22:46:43 +02:00
										 |  |  | from flask_babel import gettext | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  | from searx import searx_dir | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-24 16:30:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  | name = "HTTPS rewrite" | 
					
						
							|  |  |  | description = gettext('Rewrite HTTP links to HTTPS if possible') | 
					
						
							|  |  |  | default_on = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if 'SEARX_HTTPS_REWRITE_PATH' in environ: | 
					
						
							|  |  |  |     rules_path = environ['SEARX_rules_path'] | 
					
						
							|  |  |  | else: | 
					
						
							|  |  |  |     rules_path = join(searx_dir, 'plugins/https_rules') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-01-09 04:13:05 +01:00
										 |  |  | logger = logger.getChild("https_rewrite") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-24 16:30:04 +02:00
# Rulesets originate from the HTTPS Everywhere project:
# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules

# HTTPS rewrite rules, filled by load_https_rules() at import time;
# each entry is a (target_hosts_regex, rules, exclusions) tuple
https_rules = []
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # load single ruleset from a xml file | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  | def load_single_https_ruleset(rules_path): | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |     ruleset = () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # init parser | 
					
						
							|  |  |  |     parser = etree.XMLParser() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # load and parse xml-file | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  |         tree = etree.parse(rules_path, parser) | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |     except: | 
					
						
							|  |  |  |         # TODO, error message | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # get root node | 
					
						
							|  |  |  |     root = tree.getroot() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # check if root is a node with the name ruleset | 
					
						
							|  |  |  |     # TODO improve parsing | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |     if root.tag != 'ruleset': | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # check if rule is deactivated by default | 
					
						
							|  |  |  |     if root.attrib.get('default_off'): | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # check if rule does only work for specific platforms | 
					
						
							|  |  |  |     if root.attrib.get('platform'): | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     hosts = [] | 
					
						
							|  |  |  |     rules = [] | 
					
						
							|  |  |  |     exclusions = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # parse childs from ruleset | 
					
						
							|  |  |  |     for ruleset in root: | 
					
						
							|  |  |  |         # this child define a target | 
					
						
							|  |  |  |         if ruleset.tag == 'target': | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # check if required tags available | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |             if not ruleset.attrib.get('host'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # convert host-rule to valid regex | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             host = ruleset.attrib.get('host')\ | 
					
						
							| 
									
										
										
										
											2016-07-11 15:29:47 +02:00
										 |  |  |                 .replace('.', r'\.').replace('*', '.*') | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             # append to host list | 
					
						
							|  |  |  |             hosts.append(host) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # this child define a rule | 
					
						
							|  |  |  |         elif ruleset.tag == 'rule': | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # check if required tags available | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |             if not ruleset.attrib.get('from')\ | 
					
						
							|  |  |  |                or not ruleset.attrib.get('to'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # TODO hack, which convert a javascript regex group | 
					
						
							|  |  |  |             # into a valid python regex group | 
					
						
							| 
									
										
										
										
											2014-12-19 22:40:37 +01:00
										 |  |  |             rule_from = ruleset.attrib['from'].replace('$', '\\') | 
					
						
							|  |  |  |             if rule_from.endswith('\\'): | 
					
						
							| 
									
										
										
										
											2016-01-18 12:47:31 +01:00
										 |  |  |                 rule_from = rule_from[:-1] + '$' | 
					
						
							| 
									
										
										
										
											2014-12-19 22:40:37 +01:00
										 |  |  |             rule_to = ruleset.attrib['to'].replace('$', '\\') | 
					
						
							|  |  |  |             if rule_to.endswith('\\'): | 
					
						
							| 
									
										
										
										
											2016-01-18 12:47:31 +01:00
										 |  |  |                 rule_to = rule_to[:-1] + '$' | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # TODO, not working yet because of the hack above, | 
					
						
							|  |  |  |             # currently doing that in webapp.py | 
					
						
							|  |  |  |             # rule_from_rgx = re.compile(rule_from, re.I) | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             # append rule | 
					
						
							| 
									
										
										
										
											2014-12-19 22:40:37 +01:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 rules.append((re.compile(rule_from, re.I | re.U), rule_to)) | 
					
						
							|  |  |  |             except: | 
					
						
							|  |  |  |                 # TODO log regex error | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # this child define an exclusion | 
					
						
							|  |  |  |         elif ruleset.tag == 'exclusion': | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |             # check if required tags available | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  |             if not ruleset.attrib.get('pattern'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             exclusion_rgx = re.compile(ruleset.attrib.get('pattern')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # append exclusion | 
					
						
							|  |  |  |             exclusions.append(exclusion_rgx) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # convert list of possible hosts to a simple regex | 
					
						
							|  |  |  |     # TODO compress regex to improve performance | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U) | 
					
						
							|  |  |  |     except: | 
					
						
							|  |  |  |         return () | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # return ruleset | 
					
						
							|  |  |  |     return (target_hosts, rules, exclusions) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # load all https rewrite rules | 
					
						
							|  |  |  | def load_https_rules(rules_path): | 
					
						
							| 
									
										
										
										
											2014-10-19 21:39:30 +02:00
										 |  |  |     # check if directory exists | 
					
						
							|  |  |  |     if not isdir(rules_path): | 
					
						
							| 
									
										
										
										
											2015-01-09 04:13:05 +01:00
										 |  |  |         logger.error("directory not found: '" + rules_path + "'") | 
					
						
							| 
									
										
										
										
											2014-10-19 21:39:30 +02:00
										 |  |  |         return | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # search all xml files which are stored in the https rule directory | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  |     xml_files = [join(rules_path, f) | 
					
						
							|  |  |  |                  for f in listdir(rules_path) | 
					
						
							|  |  |  |                  if isfile(join(rules_path, f)) and f[-4:] == '.xml'] | 
					
						
							| 
									
										
										
										
											2014-09-14 11:09:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # load xml-files | 
					
						
							|  |  |  |     for ruleset_file in xml_files: | 
					
						
							|  |  |  |         # calculate rewrite-rules | 
					
						
							|  |  |  |         ruleset = load_single_https_ruleset(ruleset_file) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # skip if no ruleset returned | 
					
						
							|  |  |  |         if not ruleset: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # append ruleset | 
					
						
							|  |  |  |         https_rules.append(ruleset) | 
					
						
							| 
									
										
										
										
											2014-10-19 12:18:21 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-01-09 04:13:05 +01:00
										 |  |  |     logger.info('{n} rules loaded'.format(n=len(https_rules))) | 
					
						
							| 
									
										
										
										
											2014-12-19 22:40:37 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def https_url_rewrite(result): | 
					
						
							|  |  |  |     skip_https_rewrite = False | 
					
						
							|  |  |  |     # check if HTTPS rewrite is possible | 
					
						
							|  |  |  |     for target, rules, exclusions in https_rules: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # check if target regex match with url | 
					
						
							|  |  |  |         if target.match(result['parsed_url'].netloc): | 
					
						
							|  |  |  |             # process exclusions | 
					
						
							|  |  |  |             for exclusion in exclusions: | 
					
						
							|  |  |  |                 # check if exclusion match with url | 
					
						
							|  |  |  |                 if exclusion.match(result['url']): | 
					
						
							|  |  |  |                     skip_https_rewrite = True | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # skip https rewrite if required | 
					
						
							|  |  |  |             if skip_https_rewrite: | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # process rules | 
					
						
							|  |  |  |             for rule in rules: | 
					
						
							|  |  |  |                 try: | 
					
						
							|  |  |  |                     new_result_url = rule[0].sub(rule[1], result['url']) | 
					
						
							|  |  |  |                 except: | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # parse new url | 
					
						
							|  |  |  |                 new_parsed_url = urlparse(new_result_url) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # continiue if nothing was rewritten | 
					
						
							|  |  |  |                 if result['url'] == new_result_url: | 
					
						
							|  |  |  |                     continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # get domainname from result | 
					
						
							|  |  |  |                 # TODO, does only work correct with TLD's like | 
					
						
							|  |  |  |                 #  asdf.com, not for asdf.com.de | 
					
						
							|  |  |  |                 # TODO, using publicsuffix instead of this rewrite rule | 
					
						
							|  |  |  |                 old_result_domainname = '.'.join( | 
					
						
							|  |  |  |                     result['parsed_url'].hostname.split('.')[-2:]) | 
					
						
							|  |  |  |                 new_result_domainname = '.'.join( | 
					
						
							|  |  |  |                     new_parsed_url.hostname.split('.')[-2:]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # check if rewritten hostname is the same, | 
					
						
							|  |  |  |                 # to protect against wrong or malicious rewrite rules | 
					
						
							|  |  |  |                 if old_result_domainname == new_result_domainname: | 
					
						
							|  |  |  |                     # set new url | 
					
						
							|  |  |  |                     result['url'] = new_result_url | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # target has matched, do not search over the other rules | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  |     return result | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-10-22 14:01:53 +02:00
										 |  |  | def on_result(request, search, result): | 
					
						
							| 
									
										
										
										
											2015-04-13 00:30:12 +02:00
										 |  |  |     if result['parsed_url'].scheme == 'http': | 
					
						
							|  |  |  |         https_url_rewrite(result) | 
					
						
							|  |  |  |     return True | 
					
						
							| 
									
										
										
										
											2015-04-13 00:40:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# populate https_rules at import time so the plugin is ready before the
# first on_result() call
load_https_rules(rules_path)