[mod] result.py: merge infobox URLs and attributes when they share the same label or the same entity
Entities are Wikidata entities (like "Q42" for "Douglas Adams", see https://www.wikidata.org/wiki/Q42 ).
This commit is contained in:
		
							parent
							
								
									23f4203dfb
								
							
						
					
					
						commit
						382fded665
					
				| @ -20,6 +20,18 @@ def result_content_len(content): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def compare_urls(url_a, url_b): | def compare_urls(url_a, url_b): | ||||||
|  |     """Lazy compare between two URL. | ||||||
|  |     "www.example.com" and "example.com" are equals. | ||||||
|  |     "www.example.com/path/" and "www.example.com/path" are equals. | ||||||
|  |     "https://www.example.com/" and "http://www.example.com/" are equals. | ||||||
|  | 
 | ||||||
|  |     Args: | ||||||
|  |         url_a (ParseResult): first URL | ||||||
|  |         url_b (ParseResult): second URL | ||||||
|  | 
 | ||||||
|  |     Returns: | ||||||
|  |         bool: True if url_a and url_b are equals | ||||||
|  |     """ | ||||||
|     # ignore www. in comparison |     # ignore www. in comparison | ||||||
|     if url_a.netloc.startswith('www.'): |     if url_a.netloc.startswith('www.'): | ||||||
|         host_a = url_a.netloc.replace('www.', '', 1) |         host_a = url_a.netloc.replace('www.', '', 1) | ||||||
| @ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2): | |||||||
|         for url2 in infobox2.get('urls', []): |         for url2 in infobox2.get('urls', []): | ||||||
|             unique_url = True |             unique_url = True | ||||||
|             parsed_url2 = urlparse(url2.get('url', '')) |             parsed_url2 = urlparse(url2.get('url', '')) | ||||||
|  |             entity_url2 = url2.get('entity') | ||||||
|             for url1 in urls1: |             for url1 in urls1: | ||||||
|                 if compare_urls(urlparse(url1.get('url', '')), parsed_url2): |                 if (entity_url2 is not None and url1.get('entity') == entity_url2)\ | ||||||
|  |                    or compare_urls(urlparse(url1.get('url', '')), parsed_url2): | ||||||
|                     unique_url = False |                     unique_url = False | ||||||
|                     break |                     break | ||||||
|             if unique_url: |             if unique_url: | ||||||
| @ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2): | |||||||
|             infobox1['img_src'] = img2 |             infobox1['img_src'] = img2 | ||||||
| 
 | 
 | ||||||
|     if 'attributes' in infobox2: |     if 'attributes' in infobox2: | ||||||
|         attributes1 = infobox1.get('attributes', None) |         attributes1 = infobox1.get('attributes') | ||||||
|         if attributes1 is None: |         if attributes1 is None: | ||||||
|             attributes1 = [] |             infobox1['attributes'] = attributes1 = [] | ||||||
|             infobox1['attributes'] = attributes1 |  | ||||||
| 
 | 
 | ||||||
|         attributeSet = set() |         attributeSet = set() | ||||||
|         for attribute in infobox1.get('attributes', []): |         for attribute in attributes1: | ||||||
|             if attribute.get('label', None) not in attributeSet: |             label = attribute.get('label') | ||||||
|                 attributeSet.add(attribute.get('label', None)) |             if label not in attributeSet: | ||||||
|  |                 attributeSet.add(label) | ||||||
|  |             entity = attribute.get('entity') | ||||||
|  |             if entity not in attributeSet: | ||||||
|  |                 attributeSet.add(entity) | ||||||
| 
 | 
 | ||||||
|         for attribute in infobox2.get('attributes', []): |         for attribute in infobox2.get('attributes', []): | ||||||
|             if attribute.get('label', None) not in attributeSet: |             if attribute.get('label') not in attributeSet\ | ||||||
|  |                and attribute.get('entity') not in attributeSet: | ||||||
|                 attributes1.append(attribute) |                 attributes1.append(attribute) | ||||||
| 
 | 
 | ||||||
|     if 'content' in infobox2: |     if 'content' in infobox2: | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user