[fix] google engine: remove <script> tags from result items
In some results, Google returns a <script> tag that must be removed before extracting the content.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									55481a6377
								
							
						
					
					
						commit
						1cdf0583da
					
				@ -59,11 +59,6 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
 | 
				
			|||||||
# specific xpath variables
 | 
					# specific xpath variables
 | 
				
			||||||
# ------------------------
 | 
					# ------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
 | 
					 | 
				
			||||||
title_xpath = './/a/h3[1]'
 | 
					 | 
				
			||||||
href_xpath = './/a[h3]/@href'
 | 
					 | 
				
			||||||
content_xpath = './/div[contains(@data-sncf, "1")]'
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# Suggestions are links placed in a *card-section*, we extract only the text
 | 
					# Suggestions are links placed in a *card-section*, we extract only the text
 | 
				
			||||||
# from the links not the links itself.
 | 
					# from the links not the links itself.
 | 
				
			||||||
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
 | 
					suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
 | 
				
			||||||
@ -345,22 +340,27 @@ def response(resp):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    # parse results
 | 
					    # parse results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks
 | 
					    for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
 | 
				
			||||||
 | 
					        # pylint: disable=too-many-nested-blocks
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
 | 
					            title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
 | 
				
			||||||
            if title_tag is None:
 | 
					            if title_tag is None:
 | 
				
			||||||
                # this not one of the common google results *section*
 | 
					                # this not one of the common google results *section*
 | 
				
			||||||
                logger.debug('ignoring item from the result_xpath list: missing title')
 | 
					                logger.debug('ignoring item from the result_xpath list: missing title')
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            title = extract_text(title_tag)
 | 
					            title = extract_text(title_tag)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            url = eval_xpath_getindex(result, href_xpath, 0, None)
 | 
					            url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
 | 
				
			||||||
            if url is None:
 | 
					            if url is None:
 | 
				
			||||||
                logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
 | 
					                logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            content_nodes = eval_xpath(result, content_xpath)
 | 
					            content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
 | 
				
			||||||
 | 
					            for item in content_nodes:
 | 
				
			||||||
 | 
					                for script in item.xpath(".//script"):
 | 
				
			||||||
 | 
					                    script.getparent().remove(script)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            content = extract_text(content_nodes)
 | 
					            content = extract_text(content_nodes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if not content:
 | 
					            if not content:
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user