[fix] ignore scripts/styles in html_to_text
This commit is contained in:
		
							parent
							
								
									469e08881e
								
							
						
					
					
						commit
						1408859b4b
					
				| @ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64', | |||||||
| 
 | 
 | ||||||
| ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}" | ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}" | ||||||
| 
 | 
 | ||||||
|  | blocked_tags = ('script', | ||||||
|  |                 'style') | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def gen_useragent(): | def gen_useragent(): | ||||||
|     # TODO |     # TODO | ||||||
| @ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser): | |||||||
|     def __init__(self): |     def __init__(self): | ||||||
|         HTMLParser.__init__(self) |         HTMLParser.__init__(self) | ||||||
|         self.result = [] |         self.result = [] | ||||||
|  |         self.tags = [] | ||||||
|  | 
 | ||||||
|  |     def handle_starttag(self, tag, attrs): | ||||||
|  |         print tag | ||||||
|  |         self.tags.append(tag) | ||||||
|  | 
 | ||||||
|  |     def handle_endtag(self, tag): | ||||||
|  |         print tag,tag | ||||||
|  |         if tag != self.tags[-1]: | ||||||
|  |             raise Exception("invalid html") | ||||||
|  |         self.tags.pop() | ||||||
|  | 
 | ||||||
|  |     def is_valid_tag(self): | ||||||
|  |         return not self.tags or self.tags[-1] not in blocked_tags | ||||||
| 
 | 
 | ||||||
|     def handle_data(self, d): |     def handle_data(self, d): | ||||||
|  |         if not self.is_valid_tag(): | ||||||
|  |             return | ||||||
|         self.result.append(d) |         self.result.append(d) | ||||||
| 
 | 
 | ||||||
|     def handle_charref(self, number): |     def handle_charref(self, number): | ||||||
|  |         if not self.is_valid_tag(): | ||||||
|  |             return | ||||||
|         if number[0] in (u'x', u'X'): |         if number[0] in (u'x', u'X'): | ||||||
|             codepoint = int(number[1:], 16) |             codepoint = int(number[1:], 16) | ||||||
|         else: |         else: | ||||||
| @ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser): | |||||||
|         self.result.append(unichr(codepoint)) |         self.result.append(unichr(codepoint)) | ||||||
| 
 | 
 | ||||||
|     def handle_entityref(self, name): |     def handle_entityref(self, name): | ||||||
|  |         if not self.is_valid_tag(): | ||||||
|  |             return | ||||||
|         # codepoint = htmlentitydefs.name2codepoint[name] |         # codepoint = htmlentitydefs.name2codepoint[name] | ||||||
|         # self.result.append(unichr(codepoint)) |         # self.result.append(unichr(codepoint)) | ||||||
|         self.result.append(name) |         self.result.append(name) | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user