From 6c421110b57c695e9c0a0d9212bc271d701d17da Mon Sep 17 00:00:00 2001
From: ahmad-alkadri <ahmad.alkadri@outlook.com>
Date: Sat, 14 Jan 2023 23:00:08 +0000
Subject: [PATCH 1/2] Perso branch - added cjk check and enclosement

---
 searx/webutils.py | 66 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/searx/webutils.py b/searx/webutils.py
index 35f4401d2..150b376fa 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -113,31 +113,65 @@ def prettify_url(url, max_length=74):
         return url
 
 
+def contains_cjko(s: str) -> bool:
+    """This function checks whether or not a string contains Chinese, Japanese,
+    or Korean characters. It employs regex and uses the u escape sequence to
+    match any character in a set of Unicode ranges.
+
+    Args:
+        s (str): string to be checked.
+
+    Returns:
+        bool: True if the input s contains the characters and False otherwise.
+    """
+    unicode_ranges = ('\u4e00-\u9fff' # Chinese characters
+                      '\u3040-\u309f' # Japanese hiragana
+                      '\u30a0-\u30ff' # Japanese katakana
+                      '\u4e00-\u9faf' # Japanese kanji
+                      '\uac00-\ud7af' # Korean hangul syllables
+                      '\u1100-\u11ff' # Korean hangul jamo
+                      )
+    return bool(re.search(fr'[{unicode_ranges}]', s))
+
+
+def regex_highlight_cjk(word: str) -> str:
+    """Generate the regex pattern to match for a given word according
+    to whether or not the word contains CJK characters or not.
+    If the word contains any CJK character, the regex pattern will match
+    every occurrence of the word throughout the text, whatever surrounds
+    it; if not, it will match the word only where it starts at a word
+    boundary and is not immediately followed by a word character.
+
+    Args:
+        word (str): the word to be matched with regex pattern.
+
+    Returns:
+        str: the regex pattern for the word.
+    """
+    rword = re.escape(word)
+    if contains_cjko(rword):
+        return fr'({rword})'
+    else:
+        return fr'\b({rword})(?!\w)'
+
+
 def highlight_content(content, query):
 
     if not content:
         return None
+
     # ignoring html contents
     # TODO better html content detection
     if content.find('<') != -1:
         return content
 
-    if content.lower().find(query.lower()) > -1:
-        query_regex = '({0})'.format(re.escape(query))
-        content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
-    else:
-        regex_parts = []
-        for chunk in query.split():
-            chunk = chunk.replace('"', '')
-            if len(chunk) == 0:
-                continue
-            elif len(chunk) == 1:
-                regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk)))
-            else:
-                regex_parts.append('{0}'.format(re.escape(chunk)))
-        query_regex = '({0})'.format('|'.join(regex_parts))
-        content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
-
+    querysplit = query.split()
+    queries = []
+    for qs in querysplit:
+        queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U))
+    if len(queries) > 0:
+        for q in set(queries):
+            content = re.sub(regex_highlight_cjk(q), f'<span class="highlight">{q}</span>', content)
     return content
 
 

From 99b5272d9a17ffd813fc8c0b2f3cae3201d2398e Mon Sep 17 00:00:00 2001
From: ahmad-alkadri <ahmad.alkadri@outlook.com>
Date: Sun, 15 Jan 2023 15:08:11 +0000
Subject: [PATCH 2/2] A little fix and modified the testing for content
 highlight

---
 searx/webutils.py           | 19 +++++++++++--------
 tests/unit/test_webutils.py | 19 ++++++++++---------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/searx/webutils.py b/searx/webutils.py
index 150b376fa..7b9a8045c 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -124,13 +124,14 @@ def contains_cjko(s: str) -> bool:
     Returns:
         bool: True if the input s contains the characters and False otherwise.
     """
-    unicode_ranges = ('\u4e00-\u9fff' # Chinese characters
-                      '\u3040-\u309f' # Japanese hiragana
-                      '\u30a0-\u30ff' # Japanese katakana
-                      '\u4e00-\u9faf' # Japanese kanji
-                      '\uac00-\ud7af' # Korean hangul syllables
-                      '\u1100-\u11ff' # Korean hangul jamo
-                      )
+    unicode_ranges = (
+        '\u4e00-\u9fff'  # Chinese characters
+        '\u3040-\u309f'  # Japanese hiragana
+        '\u30a0-\u30ff'  # Japanese katakana
+        '\u4e00-\u9faf'  # Japanese kanji
+        '\uac00-\ud7af'  # Korean hangul syllables
+        '\u1100-\u11ff'  # Korean hangul jamo
+    )
     return bool(re.search(fr'[{unicode_ranges}]', s))
 
 
@@ -168,7 +169,9 @@ def highlight_content(content, query):
     querysplit = query.split()
     queries = []
     for qs in querysplit:
-        queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U))
+        qs = qs.replace("'", "").replace('"', '').replace(" ", "")
+        if len(qs) > 0:
+            queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U))
     if len(queries) > 0:
         for q in set(queries):
             content = re.sub(regex_highlight_cjk(q), f'<span class="highlight">{q}</span>', content)
diff --git a/tests/unit/test_webutils.py b/tests/unit/test_webutils.py
index 31a0f86ce..acf1aeeb7 100644
--- a/tests/unit/test_webutils.py
+++ b/tests/unit/test_webutils.py
@@ -28,32 +28,33 @@ class TestWebUtils(SearxTestCase):
 
         content = 'a'
         query = 'test'
-        self.assertEqual(webutils.highlight_content(content, query), content)
+        self.assertEqual(webutils.highlight_content(content, query), 'a')
         query = 'a test'
-        self.assertEqual(webutils.highlight_content(content, query), content)
+        self.assertEqual(webutils.highlight_content(content, query), '<span class="highlight">a</span>')
 
         data = (
             ('" test "', 'a test string', 'a <span class="highlight">test</span> string'),
-            ('"a"', 'this is a test string', 'this is<span class="highlight"> a </span>test string'),
+            ('"a"', 'this is a test string', 'this is <span class="highlight">a</span> test string'),
             (
                 'a test',
                 'this is a test string that matches entire query',
-                'this is <span class="highlight">a test</span> string that matches entire query',
+                'this is <span class="highlight">a</span> <span class="highlight">test</span> string that matches entire query',
             ),
             (
                 'this a test',
                 'this is a string to test.',
                 (
-                    '<span class="highlight">this</span> is<span class="highlight"> a </span>'
-                    'string to <span class="highlight">test</span>.'
+                    '<span class="highlight">this</span> is <span class="highlight">a</span> string to <span class="highlight">test</span>.'
                 ),
             ),
             (
                 'match this "exact phrase"',
                 'this string contains the exact phrase we want to match',
-                (
-                    '<span class="highlight">this</span> string contains the <span class="highlight">exact</span>'
-                    ' <span class="highlight">phrase</span> we want to <span class="highlight">match</span>'
+                ''.join(
+                    [
+                        '<span class="highlight">this</span> string contains the <span class="highlight">exact</span> ',
+                        '<span class="highlight">phrase</span> we want to <span class="highlight">match</span>',
+                    ]
                 ),
             ),
         )