Merge pull request #2241 from dalf/move-extract-text-and-url
Move the extract_text and extract_url functions to searx.utils
Commit: b728cb610b
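For the engine modules this commit is a pure import move: extract_text (and extract_url where it is used) now come from searx.utils instead of searx.engines.xpath, folded into the existing searx.utils import where one was already present. A minimal before/after sketch of the pattern the diffs below repeat across the engines:

    # before: the helper lived in the xpath engine module
    from searx.engines.xpath import extract_text
    from searx.utils import get_torrent_size

    # after: a single import from searx.utils
    from searx.utils import extract_text, get_torrent_size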
@@ -1,7 +1,6 @@
 from urllib.parse import quote, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size


 url = 'https://1337x.to/'

@@ -11,8 +11,7 @@

 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero

 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']

@@ -11,7 +11,7 @@

 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 # engine dependent config

@@ -13,7 +13,7 @@

 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -17,8 +17,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from searx import logger, utils
-from searx.engines.xpath import extract_text
-from searx.utils import match_language, gen_useragent, eval_xpath
+from searx.utils import extract_text, match_language, gen_useragent, eval_xpath

 logger = logger.getChild('bing engine')

@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size

 # engine dependent config
 categories = ['videos', 'music', 'files']

@@ -15,7 +15,7 @@
 from lxml import html
 import re
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 # engine dependent config

@@ -12,8 +12,7 @@

 from urllib.parse import urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size


 categories = ['videos', 'music', 'files']

@@ -11,8 +11,7 @@

 from urllib.parse import urlencode
 from lxml.html import fromstring
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath

 # engine dependent config
 categories = ['general']  # TODO , 'images', 'music', 'videos', 'files'

@@ -16,9 +16,8 @@
 from lxml.html import fromstring
 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.poolrequests import get
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath

 # engine dependent config
 categories = ['general']

@@ -13,9 +13,8 @@ import json
 from urllib.parse import urlencode
 from lxml import html
 from re import compile
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
-from searx.utils import html_to_text, match_language
+from searx.utils import extract_text, html_to_text, match_language

 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

@@ -15,12 +15,12 @@

 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import (
     _fetch_supported_languages, supported_languages_url,
     get_region_code, language_aliases
 )
 from searx.poolrequests import get
+from searx.utils import extract_text

 # engine dependent config
 categories = ['images']

@@ -11,8 +11,7 @@
 from lxml import html, etree
 import re
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 from searx import logger

 categories = ['general']

@@ -11,8 +11,7 @@

 from lxml import html
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath

 categories = ['general']
 paging = False

@@ -11,7 +11,7 @@

 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['files']

@@ -13,7 +13,7 @@
 from html import escape
 from urllib.parse import urljoin, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -13,7 +13,7 @@

 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -21,9 +21,8 @@ Definitions`_.
 from urllib.parse import urlencode, urlparse
 from lxml import html
 from flask_babel import gettext
-from searx.engines.xpath import extract_text
 from searx import logger
-from searx.utils import match_language, eval_xpath
+from searx.utils import match_language, extract_text, eval_xpath

 logger = logger.getChild('google engine')

@@ -28,8 +28,7 @@ from urllib.parse import urlencode, urlparse, unquote
 from lxml import html
 from flask_babel import gettext
 from searx import logger
-from searx.utils import eval_xpath
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text, eval_xpath

 # pylint: disable=unused-import
 from searx.engines.google import (

@@ -14,7 +14,7 @@ from datetime import date, timedelta
 from json import loads
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 import re

 # engine dependent config

@@ -16,7 +16,7 @@ from urllib.parse import urlencode
 from lxml import html
 from dateutil import parser
 from html.parser import HTMLParser
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 # engine dependent config

@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, convert_str_to_int
+from searx.utils import extract_text, get_torrent_size, convert_str_to_int

 # engine dependent config
 categories = ['videos', 'music', 'files']

@@ -11,8 +11,7 @@

 from lxml import html
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero

 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']

@@ -13,8 +13,7 @@ from datetime import datetime
 from operator import itemgetter

 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size

 # engine dependent config
 categories = ["videos", "music", "files"]

@@ -12,7 +12,7 @@ from lxml import html
 from json import loads
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 url = 'https://seedpeer.me/'

@@ -12,7 +12,7 @@

 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -17,9 +17,8 @@ import re
 from unicodedata import normalize, combining
 from babel import Locale
 from babel.localedata import locale_identifiers
-from searx.engines.xpath import extract_text
 from searx.languages import language_codes
-from searx.utils import eval_xpath, match_language
+from searx.utils import extract_text, eval_xpath, match_language

 # engine dependent config
 categories = ['general']

@@ -13,9 +13,8 @@
 import re
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
 from datetime import datetime
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero

 # engine dependent config
 categories = ['files', 'videos', 'music']

@@ -15,8 +15,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size

 # engine dependent config
 categories = ['files', 'videos', 'music']

@@ -15,7 +15,7 @@
 from urllib.parse import urlencode, urljoin
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['social media']

@@ -13,9 +13,8 @@

 from searx import logger
 from searx.poolrequests import get
-from searx.engines.xpath import extract_text
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath

 from urllib.parse import urlencode
 from json import loads

@@ -12,7 +12,7 @@

 from lxml import html
 from urllib.parse import urlencode, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['images']

@@ -1,7 +1,6 @@
-from urllib.parse import unquote, urlencode, urljoin, urlparse
 from lxml import html
-from lxml.etree import _ElementStringResult, _ElementUnicodeResult
-from searx.utils import html_to_text, eval_xpath
+from urllib.parse import urlencode
+from searx.utils import extract_text, extract_url, eval_xpath

 search_url = None
 url_xpath = None
@@ -21,76 +20,6 @@ page_size = 1
 first_page_num = 1


-'''
-if xpath_results is list, extract the text from each result and concat the list
-if xpath_results is a xml element, extract all the text node from it
-   ( text_content() method from lxml )
-if xpath_results is a string element, then it's already done
-'''
-
-
-def extract_text(xpath_results):
-    if type(xpath_results) == list:
-        # it's list of result : concat everything using recursive call
-        result = ''
-        for e in xpath_results:
-            result = result + extract_text(e)
-        return result.strip()
-    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
-        # it's a string
-        return ''.join(xpath_results)
-    else:
-        # it's a element
-        text = html.tostring(
-            xpath_results, encoding='unicode', method='text', with_tail=False
-        )
-        text = text.strip().replace('\n', ' ')
-        return ' '.join(text.split())
-
-
-def extract_url(xpath_results, search_url):
-    if xpath_results == []:
-        raise Exception('Empty url resultset')
-    url = extract_text(xpath_results)
-
-    if url.startswith('//'):
-        # add http or https to this kind of url //example.com/
-        parsed_search_url = urlparse(search_url)
-        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
-    elif url.startswith('/'):
-        # fix relative url to the search engine
-        url = urljoin(search_url, url)
-
-    # fix relative urls that fall through the crack
-    if '://' not in url:
-        url = urljoin(search_url, url)
-
-    # normalize url
-    url = normalize_url(url)
-
-    return url
-
-
-def normalize_url(url):
-    parsed_url = urlparse(url)
-
-    # add a / at this end of the url if there is no path
-    if not parsed_url.netloc:
-        raise Exception('Cannot parse url')
-    if not parsed_url.path:
-        url += '/'
-
-    # FIXME : hack for yahoo
-    if parsed_url.hostname == 'search.yahoo.com'\
-       and parsed_url.path.startswith('/r'):
-        p = parsed_url.path
-        mark = p.find('/**')
-        if mark != -1:
-            return unquote(p[mark + 3:]).decode()
-
-    return url
-
-
 def request(query, params):
     query = urlencode({'q': query})[2:]

@@ -13,8 +13,7 @@

 from urllib.parse import unquote, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, extract_url, match_language, eval_xpath

 # engine dependent config
 categories = ['general']

@@ -13,12 +13,11 @@ import re
 from datetime import datetime, timedelta
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
 from searx.engines.yahoo import (
     parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
 )
 from dateutil import parser
-from searx.utils import match_language
+from searx.utils import extract_text, extract_url, match_language

 # engine dependent config
 categories = ['news']

@@ -12,8 +12,7 @@ from lxml import html
 from operator import itemgetter
 from datetime import datetime
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 from searx.poolrequests import get as http_get

 # engine dependent config

@@ -11,8 +11,7 @@
 from functools import reduce
 from json import loads
 from urllib.parse import quote_plus
-from searx.engines.xpath import extract_text
-from searx.utils import list_get
+from searx.utils import extract_text, list_get

 # engine dependent config
 categories = ['videos', 'music']
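Call sites in the engines are untouched by this commit; only the import location changes. As an illustration of a typical response parser using the relocated helpers (the engine URL, XPath expressions and result fields below are made up for the example, not taken from any engine in this diff):

    from lxml import html
    from searx.utils import extract_text, extract_url

    base_url = 'https://example.com/'  # illustrative base URL

    def response(resp):
        results = []
        dom = html.fromstring(resp.text)
        for result in dom.xpath('//div[@class="result"]'):  # illustrative XPath
            results.append({
                'url': extract_url(result.xpath('.//a/@href'), base_url),
                'title': extract_text(result.xpath('.//h3')),
                'content': extract_text(result.xpath('.//p')),
            })
        return results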
searx/utils.py (232 changed lines)
@@ -10,9 +10,13 @@ from os.path import splitext, join
 from io import open
 from random import choice
 from html.parser import HTMLParser
-from lxml.etree import XPath
+from urllib.parse import urljoin, urlparse, unquote
+
+from lxml import html
+from lxml.etree import XPath, _ElementStringResult, _ElementUnicodeResult
 from babel.core import get_global

+
 from searx import settings
 from searx.version import VERSION_STRING
 from searx.languages import language_codes
@@ -35,12 +39,17 @@ lang_to_lc_cache = dict()


 def searx_useragent():
+    """Return the searx User Agent"""
     return 'searx/{searx_version} {suffix}'.format(
            searx_version=VERSION_STRING,
            suffix=settings['outgoing'].get('useragent_suffix', ''))


 def gen_useragent(os=None):
+    """Return a random browser User Agent
+
+    See searx/data/useragents.json
+    """
     return str(useragents['ua'].format(os=os or choice(useragents['os']), version=choice(useragents['versions'])))

@@ -95,18 +104,156 @@ class HTMLTextExtractor(HTMLParser):
         return ''.join(self.result).strip()


-def html_to_text(html):
-    html = html.replace('\n', ' ')
-    html = ' '.join(html.split())
+def html_to_text(html_str):
+    """Extract text from a HTML string
+
+    Args:
+        * html_str (str): string HTML
+
+    Returns:
+        * str: extracted text
+
+    Examples:
+        >>> html_to_text('Example <span id="42">#2</span>')
+        'Example #2'
+
+        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
+        'Example'
+    """
+    html_str = html_str.replace('\n', ' ')
+    html_str = ' '.join(html_str.split())
     s = HTMLTextExtractor()
     try:
-        s.feed(html)
+        s.feed(html_str)
     except HTMLTextExtractorException:
-        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
+        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
     return s.get_text()


+def extract_text(xpath_results):
+    """Extract text from a lxml result
+
+      * if xpath_results is list, extract the text from each result and concat the list
+      * if xpath_results is a xml element, extract all the text node from it
+        ( text_content() method from lxml )
+      * if xpath_results is a string element, then it's already done
+    """
+    if type(xpath_results) == list:
+        # it's list of result : concat everything using recursive call
+        result = ''
+        for e in xpath_results:
+            result = result + extract_text(e)
+        return result.strip()
+    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
+        # it's a string
+        return ''.join(xpath_results)
+    else:
+        # it's a element
+        text = html.tostring(
+            xpath_results, encoding='unicode', method='text', with_tail=False
+        )
+        text = text.strip().replace('\n', ' ')
+        return ' '.join(text.split())
+
+
+def normalize_url(url, base_url):
+    """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
+
+    Args:
+        * url (str): Relative URL
+        * base_url (str): Base URL, it must be an absolute URL.
+
+    Example:
+        >>> normalize_url('https://example.com', 'http://example.com/')
+        'https://example.com/'
+        >>> normalize_url('//example.com', 'http://example.com/')
+        'http://example.com/'
+        >>> normalize_url('//example.com', 'https://example.com/')
+        'https://example.com/'
+        >>> normalize_url('/path?a=1', 'https://example.com')
+        'https://example.com/path?a=1'
+        >>> normalize_url('', 'https://example.com')
+        'https://example.com/'
+        >>> normalize_url('/test', '/path')
+        raise Exception
+
+    Raises:
+        * lxml.etree.ParserError
+
+    Returns:
+        * str: normalized URL
+    """
+    if url.startswith('//'):
+        # add http or https to this kind of url //example.com/
+        parsed_search_url = urlparse(base_url)
+        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
+    elif url.startswith('/'):
+        # fix relative url to the search engine
+        url = urljoin(base_url, url)
+
+    # fix relative urls that fall through the crack
+    if '://' not in url:
+        url = urljoin(base_url, url)
+
+    parsed_url = urlparse(url)
+
+    # add a / at this end of the url if there is no path
+    if not parsed_url.netloc:
+        raise Exception('Cannot parse url')
+    if not parsed_url.path:
+        url += '/'
+
+    return url
+
+
+def extract_url(xpath_results, base_url):
+    """Extract and normalize URL from lxml Element
+
+    Args:
+        * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
+        * base_url (str): Base URL
+
+    Example:
+        >>> def f(s, search_url):
+        >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
+        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
+        'https://example.com/'
+        >>> f('https://example.com', 'http://example.com/')
+        'https://example.com/'
+        >>> f('//example.com', 'http://example.com/')
+        'http://example.com/'
+        >>> f('//example.com', 'https://example.com/')
+        'https://example.com/'
+        >>> f('/path?a=1', 'https://example.com')
+        'https://example.com/path?a=1'
+        >>> f('', 'https://example.com')
+        raise lxml.etree.ParserError
+        >>> searx.utils.extract_url([], 'https://example.com')
+        raise Exception
+
+    Raises:
+        * Exception
+        * lxml.etree.ParserError
+
+    Returns:
+        * str: normalized URL
+    """
+    if xpath_results == []:
+        raise Exception('Empty url resultset')
+
+    url = extract_text(xpath_results)
+    return normalize_url(url, base_url)
+
+
 def dict_subset(d, properties):
+    """Extract a subset of a dict
+
+    Examples:
+        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
+        {'A': 'a', 'C': 'c'}
+        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
+        {'A': 'a'}
+    """
     result = {}
     for k in properties:
         if k in d:
@@ -114,8 +261,19 @@ def dict_subset(d, properties):
     return result


-# get element in list or default value
 def list_get(a_list, index, default=None):
+    """Get element in list or default value
+
+    Examples:
+        >>> list_get(['A', 'B', 'C'], 0)
+        'A'
+        >>> list_get(['A', 'B', 'C'], 3)
+        None
+        >>> list_get(['A', 'B', 'C'], 3, 'default')
+        'default'
+        >>> list_get(['A', 'B', 'C'], -1)
+        'C'
+    """
     if len(a_list) > index:
         return a_list[index]
     else:
@@ -123,6 +281,21 @@ def list_get(a_list, index, default=None):


 def get_torrent_size(filesize, filesize_multiplier):
+    """
+
+    Args:
+        * filesize (str): size
+        * filesize_multiplier (str): TB, GB, .... TiB, GiB...
+
+    Returns:
+        * int: number of bytes
+
+    Example:
+        >>> get_torrent_size('5', 'GB')
+        5368709120
+        >>> get_torrent_size('3.14', 'MiB')
+        3140000
+    """
     try:
         filesize = float(filesize)

@@ -149,14 +322,18 @@ def get_torrent_size(filesize, filesize_multiplier):


 def convert_str_to_int(number_str):
+    """Convert number_str to int or 0 if number_str is not a number."""
     if number_str.isdigit():
         return int(number_str)
     else:
         return 0


-# convert a variable to integer or return 0 if it's not a number
 def int_or_zero(num):
+    """Convert num to int or 0. num can be either a str or a list.
+    If num is a list, the first element is converted to int (or return 0 if the list is empty).
+    If num is a str, see convert_str_to_int
+    """
     if isinstance(num, list):
         if len(num) < 1:
             return 0
@@ -165,6 +342,22 @@ def int_or_zero(num):


 def is_valid_lang(lang):
+    """Return language code and name if lang describe a language.
+
+    Examples:
+        >>> is_valid_lang('zz')
+        False
+        >>> is_valid_lang('uk')
+        (True, 'uk', 'ukrainian')
+        >>> is_valid_lang(b'uk')
+        (True, 'uk', 'ukrainian')
+        >>> is_valid_lang('en')
+        (True, 'en', 'english')
+        >>> searx.utils.is_valid_lang('Español')
+        (True, 'es', 'spanish')
+        >>> searx.utils.is_valid_lang('Spanish')
+        (True, 'es', 'spanish')
+    """
     if isinstance(lang, bytes):
         lang = lang.decode()
     is_abbr = (len(lang) == 2)
@@ -192,8 +385,8 @@ def _get_lang_to_lc_dict(lang_list):
     return value


-# auxiliary function to match lang_code in lang_list
 def _match_language(lang_code, lang_list=[], custom_aliases={}):
+    """auxiliary function to match lang_code in lang_list"""
     # replace language code with a custom alias if necessary
     if lang_code in custom_aliases:
         lang_code = custom_aliases[lang_code]
@@ -215,8 +408,8 @@ def _match_language(lang_code, lang_list=[], custom_aliases={}):
     return _get_lang_to_lc_dict(lang_list).get(lang_code, None)


-# get the language code from lang_list that best matches locale_code
 def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
+    """get the language code from lang_list that best matches locale_code"""
     # try to get language from given locale_code
     language = _match_language(locale_code, lang_list, custom_aliases)
     if language:
@@ -258,6 +451,7 @@ def load_module(filename, module_dir):


 def to_string(obj):
+    """Convert obj to its string representation."""
     if isinstance(obj, str):
         return obj
     if isinstance(obj, Number):
@@ -269,13 +463,19 @@ def to_string(obj):


 def ecma_unescape(s):
-    """
-    python implementation of the unescape javascript function
+    """Python implementation of the unescape javascript function

     https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
     https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
+
+    Examples:
+        >>> ecma_unescape('%u5409')
+        '吉'
+        >>> ecma_unescape('%20')
+        ' '
+        >>> ecma_unescape('%F3')
+        'ó'
     """
-    # s = unicode(s)
     # "%u5409" becomes "吉"
     s = ecma_unescape4_re.sub(lambda e: chr(int(e.group(1), 16)), s)
     # "%20" becomes " ", "%F3" becomes "ó"
@@ -299,6 +499,11 @@ def get_engine_from_settings(name):


 def get_xpath(xpath_str):
+    """Return cached compiled XPath
+
+    There is no thread lock.
+    Worst case scenario, xpath_str is compiled more than one time.
+    """
     result = xpath_cache.get(xpath_str, None)
     if result is None:
         result = XPath(xpath_str)
@@ -307,5 +512,6 @@ def get_xpath(xpath_str):


 def eval_xpath(element, xpath_str):
+    """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all."""
     xpath = get_xpath(xpath_str)
     return xpath(element)
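The relocated helpers behave as before, except that extract_url now delegates normalization to the new normalize_url(url, base_url) and the old Yahoo-specific '/**' unquoting hack from xpath.py was not carried over. A short usage sketch based on the docstrings above (illustrative, not part of the commit):

    from lxml import html
    from searx.utils import extract_text, extract_url

    dom = html.fromstring('<a href="/path"><span>Example</span></a>')
    extract_text(dom.xpath('//span'))                            # 'Example'
    extract_url(dom.xpath('//a/@href'), 'https://example.com')   # 'https://example.com/path'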
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import lxml.etree
+from lxml import html
+
 from searx.testing import SearxTestCase
 from searx import utils

@@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
         self.assertTrue(utils.searx_useragent().startswith('searx'))

     def test_html_to_text(self):
-        html = """
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertIsNotNone(utils.html_to_text(html_str))
+        self.assertEqual(utils.html_to_text(html_str), "Test text")
+
+    def test_extract_text(self):
+        html_str = """
         <a href="/testlink" class="link_access_account">
             <span class="toto">
                 <span>
@@ -28,9 +54,24 @@ class TestUtils(SearxTestCase):
             </span>
         </a>
         """
-        self.assertIsInstance(utils.html_to_text(html), str)
-        self.assertIsNotNone(utils.html_to_text(html))
-        self.assertEqual(utils.html_to_text(html), "Test text")
+        dom = html.fromstring(html_str)
+        self.assertEqual(utils.extract_text(dom), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
+        self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
+
+    def test_extract_url(self):
+        def f(html_str, search_url):
+            return utils.extract_url(html.fromstring(html_str), search_url)
+        self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
+        self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
+        self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
+        with self.assertRaises(lxml.etree.ParserError):
+            f('', 'https://example.com')
+        with self.assertRaises(Exception):
+            utils.extract_url([], 'https://example.com')

     def test_html_to_text_invalid(self):
         html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
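To run the new tests locally, something along the following lines should work, assuming the test module lives at the usual tests/unit/test_utils.py path (the path and tooling are assumptions, not part of this diff):

    python -m pytest tests/unit/test_utils.py -k 'extract_text or extract_url'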