diff options
Diffstat (limited to 'source-builder/sb/markdown/preprocessors.py')
-rw-r--r-- | source-builder/sb/markdown/preprocessors.py | 352 |
1 files changed, 352 insertions, 0 deletions
"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
"""

from __future__ import absolute_import
from __future__ import unicode_literals
from . import util
from . import odict
from collections import deque
import re


def build_preprocessors(md_instance, **kwargs):
    """
    Build the default set of preprocessors used by Markdown.

    Returns an ordered mapping of name -> preprocessor instance. The
    "html_block" preprocessor is left out when `md_instance.safeMode` is
    'escape'. Extra keyword arguments are accepted and ignored.
    """
    preprocessors = odict.OrderedDict()
    preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
    if md_instance.safeMode != 'escape':
        preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
    preprocessors["reference"] = ReferencePreprocessor(md_instance)
    return preprocessors


class Preprocessor(util.Processor):
    """
    Preprocessors are run after the text is broken into lines.

    Each preprocessor implements a "run" method that takes a list of the
    lines of the document, modifies it as necessary and returns either the
    same list or a new list.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Each subclass of Preprocessor should override the `run` method, which
        takes the document as a list of strings split by newlines and returns
        the (possibly modified) list of lines.

        """
        pass  # pragma: no cover


class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return `lines` with whitespace normalized.

        The lines are joined, every `util.STX` / `util.ETX` occurrence is
        removed, "\\r\\n" and "\\r" become "\\n", two newlines are appended,
        tabs are expanded to `self.markdown.tab_length` columns and lines
        holding only spaces are emptied. The result is split on "\\n" again.
        """
        source = '\n'.join(lines)
        # NOTE(review): STX/ETX are presumably the characters reserved for
        # internal placeholders -- confirm in util.
        source = source.replace(util.STX, "").replace(util.ETX, "")
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        source = source.expandtabs(self.markdown.tab_length)
        # A run of spaces that fills a whole line (i.e. directly follows a
        # newline) is dropped, leaving an empty line.
        source = re.sub(r'(?<=\n) +\n', '\n', source)
        return source.split('\n')


class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates, filled with the left tag name, tried in order to locate the
    # closing tag.
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
        attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, elements carrying a "markdown" attribute are stored with
    # `htmlStash.store_tag` and their content is left in the text.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a `(tag, length, attrs)` tuple: the tag name, the number of
        characters the opening tag occupies and a dict of its attributes
        (attributes without a value map to ""). When the block does not
        match `left_tag_re` the lower-cased text between "<" and the first
        ">" is used as the tag name and `attrs` is empty.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):            # attr="value"
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):         # attr=value
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):         # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the `rtag` that balances the tags seen from `start_index` on.

        Returns the index just past the matching `rtag`, or -1 when no
        match exists. Each nested `ltag` met before an `rtag` is resolved
        by a recursive call so that it consumes its own `rtag` first.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for `left_tag` in `block`.

        Returns `(right_tag, index)`, where `index` is the position just
        past the closing tag. Each entry of `right_tag_patterns` is tried
        in turn; a result of 2 or less (which includes the -1 "not found"
        value) counts as no match. When nothing matches, the tail of the
        block is returned as the tag together with `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True when `right_tag` closes `left_tag`.

        Tags starting with "?", "@" or "%" always count as closed, a pair
        of "--" (comment) tags matches, and otherwise `right_tag` must be
        "/" followed by `left_tag`.
        """
        # Slicing instead of indexing keeps an empty tag from raising.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if right_tag == "/" + left_tag:
            return True
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (`hr`)."""
        return tag in ('hr', 'hr/')

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        Returns `len(items)` when `stringindex` lies at or beyond the end
        of the concatenated text and -1 when it is negative. `items` is
        not modified.
        """
        if stringindex < 0:
            return -1
        count = 0
        for i, item in enumerate(items):
            count += len(item)
            if count > stringindex:
                return i
        return len(items)

    def _nested_markdown_in_html(self, items):
        """
        Find and process html child elements of the given element block.

        `items` is modified in place and returned: children carrying a
        "markdown" attribute get a `store_tag` placeholder inserted before
        them and lose their opening/closing tags, every other child is
        replaced by a single `store` placeholder.
        """
        # NOTE: `items` is deliberately mutated while it is being
        # enumerated; the statement order below must be kept.
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                tail = ''.join(items[i:])
                left_tag, left_index, attrs = self._get_left_tag(tail)
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, tail)
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    if right_listindex <= i:
                        right_listindex = i + 1
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex]))
                    del items[i:right_listindex]
                    items.insert(i, placeholder)
        return items

    def _stash_items(self, left_tag, left_index, right_tag, attrs, items,
                     new_blocks):
        """Stash the collected `items` of one element into `new_blocks`."""
        stash = self.markdown.htmlStash
        if not (self.markdown_in_raw and 'markdown' in attrs):
            # Plain raw html: the whole element becomes one placeholder.
            new_blocks.append(stash.store('\n\n'.join(items)))
            return

        # Strip the element's own opening and closing tags, keep the
        # content in the text and record the element via store_tag.
        items[0] = items[0][left_index:]
        items[-1] = items[-1][:-len(right_tag) - 2]
        # A non-empty last item (not a newline/empty string) shifts the
        # right index by one more.
        right_index = len(items) + (3 if items[-1] else 2)
        new_blocks.append(
            stash.store_tag(left_tag, attrs, 0, right_index))
        placeholderslen = len(stash.tag_data)
        new_blocks.extend(self._nested_markdown_in_html(items))
        # Adjust the outer element's right index by the number of nested
        # tags that were stored while processing its children.
        nests = len(stash.tag_data) - placeholderslen
        stash.tag_data[-1 - nests]['right_index'] += nests - 2

    def run(self, lines):
        """
        Replace raw html blocks in `lines` with stash placeholders.

        The text is split into blocks on blank lines. Blocks that open a
        block-level element (or a comment / "?", "@", "%" construct) are
        stored in `self.markdown.htmlStash`, spanning several blocks when
        the closing tag is not in the first one. Everything else is passed
        through. Returns the resulting text as a list of lines.
        """
        new_blocks = []
        # A deque gives O(1) pops and pushes at the front of the queue.
        text = deque("\n".join(lines).split("\n\n"))
        items = []      # blocks of the element currently being collected
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # flag: inside an unclosed block-level element

        while text:
            block = text.popleft()
            # Drop up to two leading newlines.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) and (
                            util.isBlockLevel(left_tag) or left_tag == '--'):
                        # Text after the closing tag is queued as the next
                        # block.
                        text.appendleft(block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag) or
                            block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        if self.markdown_in_raw and 'markdown' in attrs:
                            block = block[left_index:-len(right_tag) - 2]
                            new_blocks.append(self.markdown.htmlStash.
                                              store_tag(left_tag, attrs, 0, 2))
                            new_blocks.append(block)
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        if (not self._equal_tags(left_tag, right_tag)) and \
                           (util.isBlockLevel(left_tag) or left_tag == "--"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip())
                            )
                        continue

                else:
                    new_blocks.append(block)

            else:
                items.append(block)

                # Need to evaluate all items so we can calculate relative to
                # the left index.
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items))
                # Adjust data_index: relative to items -> relative to last
                # block
                data_index -= sum(len(item) for item in items[:-1])

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.appendleft(block[data_index:])

                    in_tag = False
                    self._stash_items(left_tag, left_index, right_tag, attrs,
                                      items, new_blocks)
                    items = []

        if items:
            # The input ended while an element was still open: stash what
            # was collected so far.
            self._stash_items(left_tag, left_index, right_tag, attrs,
                              items, new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")


class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Collect reference definitions and return the remaining lines.

        Every line matching `RE` is removed and recorded in
        `self.markdown.references` as `id -> (link, title)`, with the id
        stripped and lower-cased and surrounding "<" / ">" removed from the
        link. When the definition carries no title, the following line is
        consumed as the title if it matches `TITLE_RE`. `lines` itself is
        left untouched; a new list is returned.
        """
        new_text = []
        index, total = 0, len(lines)
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title -- but only when there is one; a
            # definition on the very last line has nothing to look at.
            if not title and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text