| 
							
							'''
 | 
						
						
						
						
							 | 
							
							searx is free software: you can redistribute it and/or modify
 | 
						
						
						
						
							 | 
							
							it under the terms of the GNU Affero General Public License as published by
 | 
						
						
						
						
							 | 
							
							the Free Software Foundation, either version 3 of the License, or
 | 
						
						
						
						
							 | 
							
							(at your option) any later version.
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							searx is distributed in the hope that it will be useful,
 | 
						
						
						
						
							 | 
							
							but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						
						
						
						
							 | 
							
							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						
						
						
						
							 | 
							
							GNU Affero General Public License for more details.
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							You should have received a copy of the GNU Affero General Public License
 | 
						
						
						
						
							 | 
							
							along with searx. If not, see < http://www.gnu.org/licenses/ >.
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
 | 
						
						
						
						
							 | 
							
							'''
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							import re
 | 
						
						
						
						
							 | 
							
							from urlparse import urlparse
 | 
						
						
						
						
							 | 
							
							from lxml import etree
 | 
						
						
						
						
							 | 
							
							from os import listdir, environ
 | 
						
						
						
						
							 | 
							
							from os.path import isfile, isdir, join
 | 
						
						
						
						
							 | 
							
							from searx.plugins import logger
 | 
						
						
						
						
							 | 
							
							from flask_babel import gettext
 | 
						
						
						
						
							 | 
							
							from searx import searx_dir
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							name = "HTTPS rewrite"
 | 
						
						
						
						
							 | 
							
							description = gettext('Rewrite HTTP links to HTTPS if possible')
 | 
						
						
						
						
							 | 
							
							default_on = True
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							if 'SEARX_HTTPS_REWRITE_PATH' in environ:
 | 
						
						
						
						
							 | 
							
							    rules_path = environ['SEARX_rules_path']
 | 
						
						
						
						
							 | 
							
							else:
 | 
						
						
						
						
							 | 
							
							    rules_path = join(searx_dir, 'plugins/https_rules')
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							logger = logger.getChild("https_rewrite")
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							# https://gitweb.torproject.org/\
 | 
						
						
						
						
							 | 
							
							# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							# HTTPS rewrite rules
 | 
						
						
						
						
							 | 
							
							https_rules = []
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							# load single ruleset from a xml file
 | 
						
						
						
						
							 | 
							
							def load_single_https_ruleset(rules_path):
 | 
						
						
						
						
							 | 
							
							    ruleset = ()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # init parser
 | 
						
						
						
						
							 | 
							
							    parser = etree.XMLParser()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # load and parse xml-file
 | 
						
						
						
						
							 | 
							
							    try:
 | 
						
						
						
						
							 | 
							
							        tree = etree.parse(rules_path, parser)
 | 
						
						
						
						
							 | 
							
							    except:
 | 
						
						
						
						
							 | 
							
							        # TODO, error message
 | 
						
						
						
						
							 | 
							
							        return ()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # get root node
 | 
						
						
						
						
							 | 
							
							    root = tree.getroot()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # check if root is a node with the name ruleset
 | 
						
						
						
						
							 | 
							
							    # TODO improve parsing
 | 
						
						
						
						
							 | 
							
							    if root.tag != 'ruleset':
 | 
						
						
						
						
							 | 
							
							        return ()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # check if rule is deactivated by default
 | 
						
						
						
						
							 | 
							
							    if root.attrib.get('default_off'):
 | 
						
						
						
						
							 | 
							
							        return ()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # check if rule does only work for specific platforms
 | 
						
						
						
						
							 | 
							
							    if root.attrib.get('platform'):
 | 
						
						
						
						
							 | 
							
							        return ()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    hosts = []
 | 
						
						
						
						
							 | 
							
							    rules = []
 | 
						
						
						
						
							 | 
							
							    exclusions = []
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # parse childs from ruleset
 | 
						
						
						
						
							 | 
							
							    for ruleset in root:
 | 
						
						
						
						
							 | 
							
							        # this child define a target
 | 
						
						
						
						
							 | 
							
							        if ruleset.tag == 'target':
 | 
						
						
						
						
							 | 
							
							            # check if required tags available
 | 
						
						
						
						
							 | 
							
							            if not ruleset.attrib.get('host'):
 | 
						
						
						
						
							 | 
							
							                continue
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # convert host-rule to valid regex
 | 
						
						
						
						
							 | 
							
							            host = ruleset.attrib.get('host')\
 | 
						
						
						
						
							 | 
							
							                .replace('.', r'\.').replace('*', '.*')
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # append to host list
 | 
						
						
						
						
							 | 
							
							            hosts.append(host)
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							        # this child define a rule
 | 
						
						
						
						
							 | 
							
							        elif ruleset.tag == 'rule':
 | 
						
						
						
						
							 | 
							
							            # check if required tags available
 | 
						
						
						
						
							 | 
							
							            if not ruleset.attrib.get('from')\
 | 
						
						
						
						
							 | 
							
							               or not ruleset.attrib.get('to'):
 | 
						
						
						
						
							 | 
							
							                continue
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # TODO hack, which convert a javascript regex group
 | 
						
						
						
						
							 | 
							
							            # into a valid python regex group
 | 
						
						
						
						
							 | 
							
							            rule_from = ruleset.attrib['from'].replace('$', '\\')
 | 
						
						
						
						
							 | 
							
							            if rule_from.endswith('\\'):
 | 
						
						
						
						
							 | 
							
							                rule_from = rule_from[:-1] + '$'
 | 
						
						
						
						
							 | 
							
							            rule_to = ruleset.attrib['to'].replace('$', '\\')
 | 
						
						
						
						
							 | 
							
							            if rule_to.endswith('\\'):
 | 
						
						
						
						
							 | 
							
							                rule_to = rule_to[:-1] + '$'
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # TODO, not working yet because of the hack above,
 | 
						
						
						
						
							 | 
							
							            # currently doing that in webapp.py
 | 
						
						
						
						
							 | 
							
							            # rule_from_rgx = re.compile(rule_from, re.I)
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # append rule
 | 
						
						
						
						
							 | 
							
							            try:
 | 
						
						
						
						
							 | 
							
							                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
 | 
						
						
						
						
							 | 
							
							            except:
 | 
						
						
						
						
							 | 
							
							                # TODO log regex error
 | 
						
						
						
						
							 | 
							
							                continue
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							        # this child define an exclusion
 | 
						
						
						
						
							 | 
							
							        elif ruleset.tag == 'exclusion':
 | 
						
						
						
						
							 | 
							
							            # check if required tags available
 | 
						
						
						
						
							 | 
							
							            if not ruleset.attrib.get('pattern'):
 | 
						
						
						
						
							 | 
							
							                continue
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # append exclusion
 | 
						
						
						
						
							 | 
							
							            exclusions.append(exclusion_rgx)
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # convert list of possible hosts to a simple regex
 | 
						
						
						
						
							 | 
							
							    # TODO compress regex to improve performance
 | 
						
						
						
						
							 | 
							
							    try:
 | 
						
						
						
						
							 | 
							
							        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
 | 
						
						
						
						
							 | 
							
							    except:
 | 
						
						
						
						
							 | 
							
							        return ()
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # return ruleset
 | 
						
						
						
						
							 | 
							
							    return (target_hosts, rules, exclusions)
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							# load all https rewrite rules
 | 
						
						
						
						
							 | 
							
							def load_https_rules(rules_path):
 | 
						
						
						
						
							 | 
							
							    # check if directory exists
 | 
						
						
						
						
							 | 
							
							    if not isdir(rules_path):
 | 
						
						
						
						
							 | 
							
							        logger.error("directory not found: '" + rules_path + "'")
 | 
						
						
						
						
							 | 
							
							        return
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # search all xml files which are stored in the https rule directory
 | 
						
						
						
						
							 | 
							
							    xml_files = [join(rules_path, f)
 | 
						
						
						
						
							 | 
							
							                 for f in listdir(rules_path)
 | 
						
						
						
						
							 | 
							
							                 if isfile(join(rules_path, f)) and f[-4:] == '.xml']
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    # load xml-files
 | 
						
						
						
						
							 | 
							
							    for ruleset_file in xml_files:
 | 
						
						
						
						
							 | 
							
							        # calculate rewrite-rules
 | 
						
						
						
						
							 | 
							
							        ruleset = load_single_https_ruleset(ruleset_file)
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							        # skip if no ruleset returned
 | 
						
						
						
						
							 | 
							
							        if not ruleset:
 | 
						
						
						
						
							 | 
							
							            continue
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							        # append ruleset
 | 
						
						
						
						
							 | 
							
							        https_rules.append(ruleset)
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							    logger.info('{n} rules loaded'.format(n=len(https_rules)))
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							def https_url_rewrite(result):
 | 
						
						
						
						
							 | 
							
							    skip_https_rewrite = False
 | 
						
						
						
						
							 | 
							
							    # check if HTTPS rewrite is possible
 | 
						
						
						
						
							 | 
							
							    for target, rules, exclusions in https_rules:
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							        # check if target regex match with url
 | 
						
						
						
						
							 | 
							
							        if target.match(result['parsed_url'].netloc):
 | 
						
						
						
						
							 | 
							
							            # process exclusions
 | 
						
						
						
						
							 | 
							
							            for exclusion in exclusions:
 | 
						
						
						
						
							 | 
							
							                # check if exclusion match with url
 | 
						
						
						
						
							 | 
							
							                if exclusion.match(result['url']):
 | 
						
						
						
						
							 | 
							
							                    skip_https_rewrite = True
 | 
						
						
						
						
							 | 
							
							                    break
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # skip https rewrite if required
 | 
						
						
						
						
							 | 
							
							            if skip_https_rewrite:
 | 
						
						
						
						
							 | 
							
							                break
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # process rules
 | 
						
						
						
						
							 | 
							
							            for rule in rules:
 | 
						
						
						
						
							 | 
							
							                try:
 | 
						
						
						
						
							 | 
							
							                    new_result_url = rule[0].sub(rule[1], result['url'])
 | 
						
						
						
						
							 | 
							
							                except:
 | 
						
						
						
						
							 | 
							
							                    break
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							                # parse new url
 | 
						
						
						
						
							 | 
							
							                new_parsed_url = urlparse(new_result_url)
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							                # continiue if nothing was rewritten
 | 
						
						
						
						
							 | 
							
							                if result['url'] == new_result_url:
 | 
						
						
						
						
							 | 
							
							                    continue
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							                # get domainname from result
 | 
						
						
						
						
							 | 
							
							                # TODO, does only work correct with TLD's like
 | 
						
						
						
						
							 | 
							
							                #  asdf.com, not for asdf.com.de
 | 
						
						
						
						
							 | 
							
							                # TODO, using publicsuffix instead of this rewrite rule
 | 
						
						
						
						
							 | 
							
							                old_result_domainname = '.'.join(
 | 
						
						
						
						
							 | 
							
							                    result['parsed_url'].hostname.split('.')[-2:])
 | 
						
						
						
						
							 | 
							
							                new_result_domainname = '.'.join(
 | 
						
						
						
						
							 | 
							
							                    new_parsed_url.hostname.split('.')[-2:])
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							                # check if rewritten hostname is the same,
 | 
						
						
						
						
							 | 
							
							                # to protect against wrong or malicious rewrite rules
 | 
						
						
						
						
							 | 
							
							                if old_result_domainname == new_result_domainname:
 | 
						
						
						
						
							 | 
							
							                    # set new url
 | 
						
						
						
						
							 | 
							
							                    result['url'] = new_result_url
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							            # target has matched, do not search over the other rules
 | 
						
						
						
						
							 | 
							
							            break
 | 
						
						
						
						
							 | 
							
							    return result
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							def on_result(request, search, result):
 | 
						
						
						
						
							 | 
							
							    if result['parsed_url'].scheme == 'http':
 | 
						
						
						
						
							 | 
							
							        https_url_rewrite(result)
 | 
						
						
						
						
							 | 
							
							    return True
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							
 | 
						
						
						
						
							 | 
							
							load_https_rules(rules_path)
 |