#!/usr/bin/env python
"""HTML Parser and Filter

Copyright 2001-2003 by Andrew Shearer.
Dual-licensed under the Python license and MPSL 1.1.

For current version, see:
http://www.shearersoftware.com/software/developers/htmlfilter/
or contact:
mailto:ashearerw at shearersoftware.com

Version history:
1.1     2003-09-28  Minor comment cleanup of HTMLDecode
1.1b1   2003-05-26  Added hex entity decoding, expanded docstring for
                    HTMLDecode.
1.1a1   2003-01-10  Added encoding support (Unicode)
1.0b7   2002-10-27  Python 1.5 compatibility (removed string instance
                    methods, +=)
1.0b6   2002-07-02  Minor optimizations
1.0b5   2002-05-29  Bug fix for tag.setAttribute(name, None)
1.0b4   2001-10-28  Added HTMLTag.hasAttribute() method;
                    renamed handleScript to handleScriptOrStyle
"""

__author__ = "Andrew Shearer"
__version__ = "1.1"

import re
import string


def HTMLDecode(value):
    """Decoder for strings containing HTML entities.
    ("&gt;" becomes ">", etc.)

    Parses numeric entities in hex and decimal, as well as all entities
    listed in Python's standard entity table (htmlentitydefs on Python 2,
    html.entities on Python 3).

    On a Unicode-capable Python, the input and output are assumed to be
    Unicode strings. (This is because Python complains when concatenating
    Unicode strings with non-Unicode strings containing characters outside
    7-bit ASCII. Numeric entities are converted to Unicode characters,
    so the combination of those and 8-bit input strings would result in
    an error.)

    Anything that is not a recognizable entity is passed through
    untouched: an unknown name ("&foo;"), a malformed or out-of-range
    numeric reference ("&#zz;", "&#x;", "&#99999999999;"), or a bare
    ampersand ("AT&T"). In those cases only the "&" itself is emitted
    literally and scanning resumes right after it, so a genuine entity
    that follows (as in "AT&T &amp; Co") is still decoded.

    value -- the string to decode.

    Returns the decoded string; the input object itself is returned when
    it contains no "&" at all.

    CHECK: should newlines & whitespace be collapsed? This would reduce
    the fidelity of attribute values--bad for form element preset values,
    where browsers tend to respect whitespace.
    """
    # String methods are used rather than the string-module functions
    # (string.find, string.join), which do not exist on Python 3.
    entityStart = value.find('&')
    if entityStart == -1:
        # only run bulk of code if there are entities present
        return value

    # The entity table and the "code point -> character" function live
    # under different names on Python 2 and Python 3.
    try:
        from htmlentitydefs import entitydefs
    except ImportError:
        from html.entities import entitydefs
    try:
        codeToChar = unichr
    except NameError:
        codeToChar = chr

    prevOffset = 0
    valueParts = []
    while entityStart != -1:
        valueParts.append(value[prevOffset:entityStart])
        entityEnd = value.find(';', entityStart + 1)
        decoded = None
        if entityEnd != -1:
            entity = value[entityStart:entityEnd + 1]
            if len(entity) < 4 or entity[1] != '#':
                # Named entity. The Python 2 table maps names above
                # Latin-1 to a numeric reference such as '&#8364;',
                # which is then handled by the numeric branch below.
                entity = entitydefs.get(entity[1:-1])
            if entity is None:
                pass    # unknown name: fall through to the literal '&'
            elif len(entity) == 1:
                # Python 2 table entries are ISO-8859-1 byte strings;
                # promote the non-ASCII ones to Unicode.
                if ord(entity) > 127 and hasattr(entity, 'decode'):
                    entity = entity.decode('iso-8859-1')
                decoded = entity
            elif len(entity) >= 4 and entity[1] == '#':
                if entity[2] in ('X', 'x'):
                    digits, base = entity[3:-1], 16
                else:
                    digits, base = entity[2:-1], 10
                try:
                    decoded = codeToChar(int(digits, base))
                except (ValueError, OverflowError):
                    # Not a number, or not a valid code point: leave the
                    # text as it was instead of raising.
                    decoded = None
            else:
                decoded = entity
        if decoded is None:
            # Keep the ampersand literally and rescan from the very next
            # character; the text after it is copied by later slices.
            decoded = '&'
            entityEnd = entityStart
        valueParts.append(decoded)
        prevOffset = entityEnd + 1
        entityStart = value.find('&', prevOffset)
    valueParts.append(value[prevOffset:])
    return ''.join(valueParts)


def HTMLEncode(value):
    """Encode HTML entities. Output is in Unicode only if this input is
    in Unicode.

    Replaces &, <, > and the double quote with &amp;, &lt;, &gt; and
    &quot;. These are exactly the substitutions cgi.escape(value, 1)
    performed; they are done inline because cgi.escape is no longer
    available on current Python 3 releases.

    value -- the string to encode.

    Returns the encoded string.
    """
    # '&' must be handled first so the ampersands introduced by the
    # other replacements are not escaped a second time.
    value = value.replace('&', '&amp;')
    value = value.replace('<', '&lt;')
    value = value.replace('>', '&gt;')
    value = value.replace('"', '&quot;')
    return value


class HTMLFilter:
    """Parse an HTML 4 document, for subclasses to pass through or modify.

    Subclasses can output an exact replica of the original or modify
    specific elements or attributes. Normally, a user of this class would
    create a subclass that did some specific filtering, call
    feedString(originalHTML), then call close(). The subclass would
    override the handleXXX methods to perform the filtering, and override
    collectHTML() if it wanted to store the generated data. Subclasses
    that only wanted to read the file and not output a modified version
    wouldn't need to override collectHTML().

    The handleXXX methods are overridden through subclassing the main
    HTMLFilter class, rather than implementing some kind of HTMLHandler
    interface, so that new handleXXX methods can be added to this base
    class with default implementations that provide backwards
    compatibility. (+++: could split off HTMLHandler class if it were
    used as a base class.)

    Data flow through HTMLFilter methods:
    feedString(originalHTML)
    -> multiple calls to handle[Text|Tag|Script|Comment|...](tag...)
       (subclasses will override to observe or modify the HTML code)
    -> collectHTML(html)
       (subclasses can store the pieces of the final HTML code)

    Has partial support of server-side scripting tags (ASP, PHP, JSP)--
    they work anywhere an HTML tag would work, but HTML tags with embedded
    code may not be parseable (for instance, if a tag contains ASP code
    inside an attribute value, subclasses can only reliably pass the whole
    tag through unmodified, not read or modify the attributes).

    Does not support SGML short tag forms (which aren't normally used or
    parsed in HTML anyway, and the HTML RFC warns about this).

    If a subclass doesn't override a handleXXX method, the default
    implementations will pass the data to collectHTML() so that the
    original HTML code is preserved. New handleXXX methods added in the
    future will therefore be backwards compatible with older subclasses,
    so that file filters never lose text.
    """

    # regexps from sgmllib
    tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9]*')
    lineterminatorfind = re.compile('\\n|\\r\\n?')

    class HTMLTag:
        """Represents an HTML tag, allowing attribute retrieval and
        modification.

        After modification, getHTML() returns the complete new tag.
        Parsing within the tag is done lazily on the first call to
        getAttribute or setAttribute, so this class is very lightweight
        if all you need is getName or getHTML.

        Attributes and tag names are case-insensitive but case-preserving.
        In fact, this class does all it can to avoid changing the case and
        whitespace in the tag.

        When changing an existing attribute value, though, this class
        always encloses the new value in double quotes (even if the
        original value had single or no quotes).

        Attribute storage:
        Each attribute is stored as a tuple of offsets into the main
        self.attrsText string.
        (attributeStart, equalsStart, valueStart, valueLimit,
         attributeLimit)
        attributeStart points to whitespace, and there is no whitespace
        directly preceding attributeLimit. In other words, whitespace
        between attributes, or between the tag name and first attribute,
        belongs to the beginning of the trailing attribute. This ensures
        that deleting attributes won't leave any extra whitespace before
        the closing >.

        If the attribute value is quoted, attributeLimit == valueLimit + 1.
        If there is no value, valueStart == valueLimit == attributeLimit.
        If there is no equals sign either (common for boolean attributes),
        equalsStart == attributeLimit.

        TODO: is whitespace required between quoted attributes?
        Implement dictionary methods for len(), items(), values(), keys()
        """

        # Matches one attribute: optional leading whitespace, the name
        # (group 1), then optionally "= value" (group 2) where the value
        # (group 3) is single-quoted, double-quoted or unquoted.
        attrfind = re.compile(
            '[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
            + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
            + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')

        def __init__(self, tagName, tagAttrs = ''):
            """Store the tag name and the raw, unparsed attribute text.

            Example: tagName='input', tagAttrs=' type="text"'
            """
            self.name = tagName
            self.attrsText = tagAttrs
            # Starts out empty; see the class docstring on lazy parsing.
            self._attributeCache = None

        def getName(self):
            """Return the name of the tag, for instance, 'BODY' or 'p'."""
            return self.name

        def setName(self, name):
            """Change the name of the tag. Use getHTML() to return the
            text of the changed tag."""
            self.name = name

        def getHTML(self):
            """Reconstruct the complete tag, including any changed
            attributes."""
            return '<' + self.name + self.attrsText + '>'

        def hasAttribute(self, name):
            """Return 1 if the named attribute exists, None if it does
            not.

            An attribute value of the empty string is still treated as
            existent.
            """
            textRange = self._readAttributeCache(name)
            if textRange:
                return 1
            else:
                return None

        def getAttribute(self, name, htmldecode = 1):
            """Return the value of the indicated tag attribute.

            If the attribute doesn't exist, return None. The value is HTML
            entity-decoded by default, which is recommended if the value
            will be processed before being put back into another attribute
            (for example, when modifying a URL, the &amp; entities must be
            decoded first).

            For boolean attributes, the "expanded" value is returned:
            returns 'checked' as the value of the 'checked' attribute.

            name -- attribute name to look up.
            htmldecode -- when true (the default), the value is
                entity-decoded and stripped of leading and trailing
                whitespace; when false, the raw text between the quotes
                is returned unchanged.
            """
            textRange = self._readAttributeCache(name)
            if textRange:
                (attributeStart, equalsStart, valueStart, valueLimit,
                 attributeLimit) = textRange
                if equalsStart != attributeLimit:
                    # has equals sign and therefore a value
                    value = self.attrsText[valueStart:valueLimit]
                    # strip leading & trailing whitespace, per RFC,
                    # and run very simple entity decoder
                    if htmldecode:
                        value = HTMLDecode(value).strip()
                else:
                    # no equals sign; expand boolean default value
                    # shorthand (per RFC); returns
                    # 'checked' as the value of the 'checked' attr
                    # Note: the case of the original attribute is not
                    # preserved, since this method only knows the case
                    # requested
                    value = name
            else:
                # attribute not found, return None
                value = None
            return value

        def __getitem__(self, name):
            """Redirect Python's [] operator to call getAttribute().

            HTML entities in the value are decoded before being returned.

            Example:
                value = tag['bgcolor']
            """
            return self.getAttribute(name) # +++ error if not found?

        def __setitem__(self, name, value):
            """Redirect assignment through [] to call setAttribute().

            The value is HTML entity-encoded before being inserted.

            Example:
                tag['bgcolor'] = '#FFFFFF'
            """
            return self.setAttribute(name, value)

        def __delitem__(self, name):
            """Redirect deletion to call deleteAttribute().

            Example:
                del tag['bgcolor']
            """
            return self.deleteAttribute(name)
This controls whether the attribute exists, and the value and equals sign are always omitted. Example: