# markdown/serializers.py
#
# Add x/html serialization to ElementTree
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------


from __future__ import absolute_import
from __future__ import unicode_literals
from . import util

ElementTree = util.etree.ElementTree
QName = util.etree.QName
if hasattr(util.etree, 'test_comment'):  # pragma: no cover
    Comment = util.etree.test_comment
else:  # pragma: no cover
    Comment = util.etree.Comment
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction

__all__ = ['to_html_string', 'to_xhtml_string']

# Tags that never get a closing tag; in XHTML mode they are written as
# self-closing (" />").
HTML_EMPTY = set((
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
))

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # Dublin Core
    "http://purl.org/dc/elements/1.1/": "dc",
}


def _raise_serialization_error(text):  # pragma: no cover
    """Raise a TypeError reporting that *text* cannot be serialized."""
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
    )


def _encode(text, encoding):
    """Encode *text* using *encoding*.

    Characters the encoding cannot represent are replaced by XML
    character references. A TypeError is raised (through
    _raise_serialization_error) if *text* cannot be encoded.
    """
    try:
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _escape_cdata(text):
    """Escape "&", "<" and ">" in character data."""
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so. assume that's, by far,
        # the most common case in most applications.
        # "&" must be handled first so the entities added below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _escape_attrib(text):
    """Escape an attribute value, including double quotes and newlines."""
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _escape_attrib_html(text):
    """Escape an HTML attribute value; newlines are left untouched."""
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _serialize_html(write, elem, qnames, namespaces, format):
    """Recursively serialize *elem*, passing each output fragment to *write*.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps namespace URIs to prefixes (both as returned by
    _namespaces). Namespace declarations are only written for the element
    this function is first called on; recursive calls pass None.
    *format* is "html" or "xhtml".
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # An element without a tag contributes only its text and
            # its children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                items = sorted(items)  # lexical order
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    if qnames[k] == v and format == 'html':
                        # handle boolean attributes
                        write(" %s" % v)
                    else:
                        write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # sorted() rather than list.sort(): on Python 3
                    # dict.items() returns a view without a sort method.
                    ns_items = sorted(
                        namespaces.items(), key=lambda x: x[1]
                    )  # sort on prefix
                    for uri, prefix in ns_items:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix, _escape_attrib(uri)
                        ))
            if format == "xhtml" and tag.lower() in HTML_EMPTY:
                write(" />")
            else:
                write(">")
                if text:
                    if tag.lower() in ("script", "style"):
                        # script and style content is written unescaped
                        write(text)
                    else:
                        write(_escape_cdata(text))
                for e in elem:
                    _serialize_html(write, e, qnames, None, format)
                if tag.lower() not in HTML_EMPTY:
                    write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))


def _write_html(root, encoding=None, default_namespace=None, format="html"):
    """Serialize the tree rooted at *root* and return the result.

    Returns the joined text when *encoding* is None; otherwise returns
    that text encoded with *encoding* via _encode.
    """
    assert root is not None
    data = []
    write = data.append
    qnames, namespaces = _namespaces(root, default_namespace)
    _serialize_html(write, root, qnames, namespaces, format)
    if encoding is None:
        return "".join(data)
    else:
        return _encode("".join(data), encoding)


# --------------------------------------------------------------------
# serialization support

def _namespaces(elem, default_namespace=None):
    """Identify the namespaces used in the tree rooted at *elem*.

    Returns a ``(qnames, namespaces)`` tuple: *qnames* maps each
    qualified name found in the tree to its serialized "prefix:local"
    form, and *namespaces* maps namespace URIs to prefixes.

    Raises ValueError if *default_namespace* is given and the tree
    contains a non-qualified name, and TypeError (through
    _raise_serialization_error) for a tag that cannot be serialized.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
        except TypeError:  # pragma: no cover
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # Nested check: a QName tag that is already registered must
            # not fall through to the serialization-error branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, util.string_type):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces


def to_html_string(element):
    """Return *element* and its descendants serialized as HTML."""
    return _write_html(ElementTree(element).getroot(), format="html")


def to_xhtml_string(element):
    """Return *element* and its descendants serialized as XHTML."""
    return _write_html(ElementTree(element).getroot(), format="xhtml")