From: Magnus Hagander Date: Fri, 30 Nov 2018 02:36:02 +0000 (+0100) Subject: Switch to using tidylib rather than tidy X-Git-Url: http://git.postgresql.org/gitweb/static/session/%7B%7Bsession.id%7D%7D-%7B%7Bsession.title%7Cslugify%7D%7D?a=commitdiff_plain;h=33907ea5a6f953ca679196f491edfff42173ad3e;p=pgarchives.git Switch to using tidylib rather than tidy tidylib (http://countergram.github.io/pytidylib/) is maintained, the old tidy one (https://cihar.com/software/utidylib/) is not. And in particular, python3 support is in the new one. Generates some minor changes in the existing archives, but it seems to be just whitespace and some actual incorrectness in the old output. --- diff --git a/loader/lib/parser.py b/loader/lib/parser.py index 871cea7..51fb26d 100644 --- a/loader/lib/parser.py +++ b/loader/lib/parser.py @@ -6,7 +6,7 @@ from email.parser import Parser from email.header import decode_header from email.errors import HeaderParseError from HTMLParser import HTMLParser, HTMLParseError -import tidy +import tidylib import StringIO from lib.exception import IgnorableException @@ -204,7 +204,7 @@ class ArchivesParser(object): if b: b = self.html_clean(b) if b: return b - if b == '': + if b == '' or b is None: hasempty = True if hasempty: @@ -483,7 +483,24 @@ class ArchivesParser(object): def html_clean(self, html): # First we pass it through tidy - html = unicode(str(tidy.parseString(html.encode('utf8'), drop_proprietary_attributes=1, alt_text='',hide_comments=1,output_xhtml=1,show_body_only=1,clean=1,char_encoding='utf8')), 'utf8') + (html, errors) = tidylib.tidy_document(html, + options={ + 'drop-proprietary-attributes': 1, + 'alt-text': '', + 'hide-comments': 1, + 'output-xhtml': 1, + 'show-body-only': 1, + 'clean': 1, + 'char-encoding': 'utf8', + 'show-warnings': 0, + 'show-info': 0, + }) + if errors: + print("HTML tidy failed for %s!" % self.msgid) + print(errors) + return None + if type(html) == str: + html = unicode(html, 'utf8') try: cleaner = HTMLCleaner()