# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring,disable=missing-class-docstring,invalid-name
import random
import string
import lxml . etree
from lxml import html
from parameterized . parameterized import parameterized
from searx . exceptions import SearxXPathSyntaxException , SearxEngineXPathException
from searx import utils
from tests import SearxTestCase
def random_string(length, choices=string.ascii_letters):
    """Return a random string made of exactly ``length`` characters.

    Every character is picked independently (with replacement) from
    ``choices`` via :func:`random.choice`.

    :param length: number of characters to generate; ``0`` or a negative
        value produces an empty string because ``range(length)`` is empty.
    :param choices: non-empty sequence of candidate characters
        (defaults to ``string.ascii_letters``).
    :return: the generated string.
    """
    # Join on the empty string: any separator would leak into the result and
    # make it longer than ``length``.
    return ''.join(random.choice(choices) for _ in range(length))
class TestUtils(SearxTestCase):
    """Tests for the user-agent, HTML/text extraction, URL and unescape helpers of ``searx.utils``."""

    # NOTE(review): many literals below carry leading/trailing spaces (e.g. ' Mozilla ').
    # They are kept verbatim -- confirm the padding is intentional.

    def test_gen_useragent(self):
        """Check type, non-None-ness and prefix of the generated user agent."""
        self.assertIsInstance(utils.gen_useragent(), str)
        self.assertIsNotNone(utils.gen_useragent())
        self.assertTrue(utils.gen_useragent().startswith(' Mozilla '))

    def test_searx_useragent(self):
        """Check type, non-None-ness and prefix of the searx user agent."""
        self.assertIsInstance(utils.searx_useragent(), str)
        self.assertIsNotNone(utils.searx_useragent())
        self.assertTrue(utils.searx_useragent().startswith(' searx '))

    def test_html_to_text(self):
        """Run ``html_to_text`` on markup containing ``<style>``/``<script>`` and on a regexp-like string."""
        # The literal is left unindented so that its content stays byte-identical.
        html_str = """
<a href= " /testlink " class= " link_access_account " >
<style>
.toto {
color: red;
}
</style>
<span class= " toto " >
<span>
<img src= " test.jpg " />
</span>
</span>
<span class= " titi " >
Test text
</span>
<script>value= ' dummy ' ;</script>
</a>
"""
        self.assertIsInstance(utils.html_to_text(html_str), str)
        self.assertIsNotNone(utils.html_to_text(html_str))
        self.assertEqual(utils.html_to_text(html_str), " Test text ")
        # Input that looks like an unterminated regexp group is expected back unchanged.
        self.assertEqual(utils.html_to_text(r" regexp: (?<![a-zA-Z] "), " regexp: (?<![a-zA-Z] ")

    def test_extract_text(self):
        """Run ``extract_text`` on an element and on each kind of XPath result (list, number, boolean, attribute)."""
        # The literal is left unindented so that its content stays byte-identical.
        html_str = """
<a href= " /testlink " class= " link_access_account " >
<span class= " toto " >
<span>
<img src= " test.jpg " />
</span>
</span>
<span class= " titi " >
Test text
</span>
</a>
"""
        dom = html.fromstring(html_str)
        self.assertEqual(utils.extract_text(dom), ' Test text ')
        self.assertEqual(utils.extract_text(dom.xpath(' //span ')), ' Test text ')
        self.assertEqual(utils.extract_text(dom.xpath(' //span/text() ')), ' Test text ')
        self.assertEqual(utils.extract_text(dom.xpath(' count(//span) ')), ' 3.0 ')
        self.assertEqual(utils.extract_text(dom.xpath(' boolean(//span) ')), ' True ')
        self.assertEqual(utils.extract_text(dom.xpath(' //img/@src ')), ' test.jpg ')
        self.assertEqual(utils.extract_text(dom.xpath(' //unexistingtag ')), ' ')

    def test_extract_text_allow_none(self):
        """With ``allow_none=True`` a ``None`` input is passed through as ``None``."""
        # assertIsNone checks identity, which is stricter than ``== None``.
        self.assertIsNone(utils.extract_text(None, allow_none=True))

    def test_extract_text_error_none(self):
        """Without ``allow_none`` a ``None`` input must raise ``ValueError``."""
        with self.assertRaises(ValueError):
            utils.extract_text(None)

    def test_extract_text_error_empty(self):
        """An empty dict is not an accepted input and must raise ``ValueError``."""
        with self.assertRaises(ValueError):
            utils.extract_text({})

    def test_extract_url(self):
        """Run ``extract_url`` on absolute, scheme-relative and path-only URLs, then on invalid input."""

        def f(html_str, search_url):
            # Parse the fragment first: ``extract_url`` is given the lxml result, not the raw string.
            return utils.extract_url(html.fromstring(html_str), search_url)

        self.assertEqual(
            f(' <span id= " 42 " >https://example.com</span> ', ' http://example.com/ '),
            ' https://example.com/ ',
        )
        self.assertEqual(f(' https://example.com ', ' http://example.com/ '), ' https://example.com/ ')
        self.assertEqual(f(' //example.com ', ' http://example.com/ '), ' http://example.com/ ')
        self.assertEqual(f(' //example.com ', ' https://example.com/ '), ' https://example.com/ ')
        self.assertEqual(f(' /path?a=1 ', ' https://example.com '), ' https://example.com/path?a=1 ')
        # Here the error is raised by ``html.fromstring`` inside the helper.
        with self.assertRaises(lxml.etree.ParserError):
            f(' ', ' https://example.com ')
        # The concrete exception type for an empty result list is not pinned by this test.
        with self.assertRaises(Exception):
            utils.extract_url([], ' https://example.com ')

    def test_html_to_text_invalid(self):
        """Run ``html_to_text`` on markup with a mismatched closing tag (``<b>`` closed by ``</i>``)."""
        _html = ' <p><b>Lorem ipsum</i>dolor sit amet</p> '
        self.assertEqual(utils.html_to_text(_html), " Lorem ipsum ")

    def test_ecma_unscape(self):
        """Run ``ecma_unescape`` on ``%XX`` and ``%uXXXX`` style escape sequences."""
        self.assertEqual(utils.ecma_unescape(' text % 20with %20s pace '), ' text with space ')
        self.assertEqual(utils.ecma_unescape(' text using %x x: %F 3 '), ' text using %x x: ó ')
        self.assertEqual(
            utils.ecma_unescape(' text using %u : %u 5409, %u 4E16 %u 754c '),
            ' text using %u : 吉, 世界 ',
        )
class TestHTMLTextExtractor(SearxTestCase):
    """Exercise the callbacks of the private ``utils._HTMLTextExtractor`` parser."""

    # NOTE(review): the literals below carry leading/trailing spaces (e.g. ' xF ').
    # They are kept verbatim -- confirm the padding is intentional.

    def setUp(self):
        """Give every test its own freshly created extractor."""
        super().setUp()
        extractor_cls = utils._HTMLTextExtractor  # pylint: disable=protected-access
        self.html_text_extractor = extractor_cls()

    def test__init__(self):
        """A new extractor exposes an empty ``result`` list."""
        collected = self.html_text_extractor.result
        self.assertEqual(collected, [])

    @parameterized.expand(
        [
            (' xF ', ' \x0f '),
            (' XF ', ' \x0f '),
            (' 97 ', ' a '),
        ]
    )
    def test_handle_charref(self, charref: str, expected: str):
        """After handling ``charref`` the extractor's ``result`` contains ``expected``."""
        extractor = self.html_text_extractor
        extractor.handle_charref(charref)
        self.assertIn(expected, extractor.result)

    def test_handle_entityref(self):
        """The entity name handed to ``handle_entityref`` is found in ``result`` afterwards."""
        entity_name = ' test '
        extractor = self.html_text_extractor
        extractor.handle_entityref(entity_name)
        self.assertIn(entity_name, extractor.result)

    def test_invalid_html(self):
        """Feeding markup with a mismatched closing tag raises the extractor's own exception."""
        malformed = ' <p><b>Lorem ipsum</i>dolor sit amet</p> '
        expected_error = utils._HTMLTextExtractorException  # pylint: disable=protected-access
        with self.assertRaises(expected_error):
            self.html_text_extractor.feed(malformed)
class TestXPathUtils(SearxTestCase):
    """Tests for the XPath helpers of ``searx.utils`` and for ``detect_language``."""

    # NOTE(review): many literals below carry leading/trailing spaces (e.g. ' //a ').
    # They are kept verbatim -- confirm the padding is intentional.

    # HTML fragment shared by the ``eval_xpath*`` tests; left unindented so the
    # literal stays byte-identical.
    TEST_DOC = """ <ul>
<li>Text in <b>bold</b> and <i>italic</i> </li>
<li>Another <b>text</b> <img src= " data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs= " ></li>
</ul> """

    def test_get_xpath_cache(self):
        """The same XPath string yields the very same object; another string yields another object."""
        xp1 = utils.get_xpath(' //a ')
        xp2 = utils.get_xpath(' //div ')
        xp3 = utils.get_xpath(' //a ')

        # Identity assertions state the intent directly instead of comparing id() values.
        self.assertIs(xp1, xp3)
        self.assertIsNot(xp1, xp2)

    def test_get_xpath_type(self):
        """An ``lxml.etree.XPath`` instance is accepted, a list raises ``TypeError``."""
        utils.get_xpath(lxml.etree.XPath(' //a '))

        with self.assertRaises(TypeError):
            utils.get_xpath([])

    def test_get_xpath_invalid(self):
        """A syntactically invalid XPath raises ``SearxXPathSyntaxException`` carrying message and xpath."""
        invalid_xpath = ' //a[0].text '
        with self.assertRaises(SearxXPathSyntaxException) as context:
            utils.get_xpath(invalid_xpath)

        self.assertEqual(context.exception.message, ' Invalid expression ')
        self.assertEqual(context.exception.xpath_str, invalid_xpath)

    def test_eval_xpath_unregistered_function(self):
        """Calling an unknown XPath function raises ``SearxEngineXPathException``."""
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        invalid_function_xpath = ' int(//a) '
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath(doc, invalid_function_xpath)

        self.assertEqual(context.exception.message, ' Unregistered function ')
        self.assertEqual(context.exception.xpath_str, invalid_function_xpath)

    def test_eval_xpath(self):
        """``eval_xpath`` returns lists for node/text selections and a float for ``count()``."""
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        self.assertEqual(utils.eval_xpath(doc, ' //p '), [])
        self.assertEqual(utils.eval_xpath(doc, ' //i/text() '), [' italic '])
        self.assertEqual(utils.eval_xpath(doc, ' count(//i) '), 1.0)

    def test_eval_xpath_list(self):
        """``eval_xpath_list`` returns the matches and enforces ``min_len``."""
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # check a non-empty list
        self.assertEqual(utils.eval_xpath_list(doc, ' //i/text() '), [' italic '])

        # check the min_len parameter
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_list(doc, ' //p ', min_len=1)
        self.assertEqual(context.exception.message, ' len(xpath_str) < 1 ')
        self.assertEqual(context.exception.xpath_str, ' //p ')

    def test_eval_xpath_getindex(self):
        """``eval_xpath_getindex`` returns the indexed item, a default, or raises."""
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # check index 0
        self.assertEqual(utils.eval_xpath_getindex(doc, ' //i/text() ', 0), ' italic ')

        # default is 'something'
        self.assertEqual(utils.eval_xpath_getindex(doc, ' //i/text() ', 1, default=' something '), ' something ')

        # default is None
        self.assertIsNone(utils.eval_xpath_getindex(doc, ' //i/text() ', 1, default=None))

        # index not found
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_getindex(doc, ' //i/text() ', 1)
        self.assertEqual(context.exception.message, ' index 1 not found ')

        # not a list
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_getindex(doc, ' count(//i) ', 1)
        self.assertEqual(context.exception.message, ' the result is not a list ')

    def test_detect_language(self):
        """``detect_language`` on single-language, empty, mixed-language and ``None`` input."""
        # make sure newlines are not an issue:
        # fasttext.predict('') does not accept a new line.
        lang = utils.detect_language(' The quick brown fox jumps over \n the lazy dog ')
        self.assertEqual(lang, ' en ')

        lang = utils.detect_language(
            ' いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす '
        )
        self.assertEqual(lang, ' ja ')

        lang = utils.detect_language(' Pijamalı hasta yağı z şoföre çabucak güvendi. ')
        self.assertEqual(lang, ' tr ')

        lang = utils.detect_language(' ')
        self.assertIsNone(lang)

        # mixed languages --> None
        lang = utils.detect_language(' The いろはにほへと Pijamalı ')
        self.assertIsNone(lang)

        with self.assertRaises(ValueError):
            utils.detect_language(None)  # type: ignore