logo

Xxml Filter

← Back to Filter List

Xxml


Stores all elements in the input XML document which have any of the attributes specified in unique-attributes or qualified-attributes.

Aliases for this filter

  • xxml
  • xmlsec

Converts from file formats:

  • .xml
  • .html
  • .txt

To file formats:

  • .json
  • .sqlite3

Available settings:

Setting | Description | Default
add-new-files | Boolean or list of extensions/patterns to match. | False
added-in-version | Dexy version when this filter was first available. |
additional-doc-filters | Filters to apply to additional documents created as side effects. | {}
additional-doc-settings | Settings to apply to additional documents created as side effects. | {}
data-type | Alias of custom data class to use to store filter output. | keyvalue
examples | Templates which should be used as examples for this filter. | []
exclude-add-new-files | List of patterns to skip even if they match add-new-files. | []
exclude-new-files-from-dir | List of directories to skip when adding new files. | []
ext | File extension to output. | None
extension-map | Dictionary mapping input extensions to default output extensions. | None
help | Helpstring for plugin. | Stores all elements in the input XML document which have any of the attributes specified in unique-attributes or qualified-attributes.
input-extensions | List of extensions which this filter can accept as input. | [u'.xml', u'.html', u'.txt']
keep-originals | Whether, if additional-doc-filters are specified, the original unmodified docs should also be added. | False
mkdir | A directory which should be created in working dir. | None
mkdirs | A list of directories which should be created in working dir. | []
nodoc | Whether filter should be excluded from documentation. | False
output | Whether to output results of this filter by default by reporters such as 'output' or 'website'. | False
output-extensions | List of extensions which this filter can produce as output. | [u'.json', u'.sqlite3']
override-workspace-exclude-filters | If True, document will be populated to other workspaces ignoring workspace-exclude-filters. | False
preserve-prior-data-class | Whether output data class should be set to match the input data class. | False
pygments | Whether to apply pygments syntax highlighting | True
qualified-attributes | Elements to be added if they have this attribute, to be qualified by element type. | [u'name']
require-output | Should dexy raise an exception if no output is produced by this filter? | True
tags | Tags which describe the filter. | []
unique-attributes | Elements to be added if they have this attribute, to be treated as globally unique. | [u'id']
variables | A dictionary of variable names and values to make available to this filter. | {}
vars | A dictionary of variable names and values to make available to this filter. | {}
workspace-exclude-filters | Filters whose output should be excluded from workspace. | [u'pyg']
workspace-includes | If set to a list of filenames or extensions, only these will be populated to working dir. | None
Filter Source Code
class XmlSectionFilter(DexyFilter):
    """
    Stores all elements in the input XML document which have any of the
    attributes specified in unique-attributes or qualified-attributes.
    """
    # NOTE(review): the docstring above appears to double as the filter's
    # published 'help' text, so it is kept verbatim -- confirm before editing.

    # Names under which this filter can be referenced.
    aliases = ["xxml", "xmlsec"]

    # Tuple values are (description, default) pairs; bare values presumably
    # override the default of a setting declared elsewhere -- TODO confirm.
    _settings = {
            'input-extensions' : [".xml", ".html", ".txt"],
            'pygments' : ("Whether to apply pygments syntax highlighting", True),
            'unique-attributes' : ("Elements to be added if they have this attribute, to be treated as globally unique.", ["id"]),
            'qualified-attributes' : ("Elements to be added if they have this attribute, to be qualified by element type.", ["name"]),
            'data-type' : 'keyvalue',
            'output-extensions' :  [".json", ".sqlite3"]
            }

    def is_active(self):
        """Return the module-level AVAILABLE flag."""
        return AVAILABLE

    def append_element_attributes_with_key(self, element, element_key):
        """
        Append the details of one parsed element to the output data.

        Every entry is stored under "<element_key>:<field>", where field is
        one of lineno, tail, text, tag, source, inner-html and attrib (a JSON
        encoded dict). When the 'pygments' setting is on, html-source and
        latex-source entries holding the highlighted source are added too.

        element     -- the parsed element to record.
        element_key -- prefix for every key appended for this element.

        The pygments branch reads self.lexer, self.html_formatter and
        self.latex_formatter, which process() assigns before iterating.
        """
        source = etree.tostring(element, pretty_print=True).strip()
        inner_html = "\n".join(etree.tostring(child) for child in element.iterchildren())
        self.output_data.append("%s:lineno" % element_key, element.sourceline)
        self.output_data.append("%s:tail" % element_key, element.tail)
        self.output_data.append("%s:text" % element_key, element.text)
        self.output_data.append("%s:tag" % element_key, element.tag)
        self.output_data.append("%s:source" % element_key, source)
        self.output_data.append("%s:inner-html" % element_key, inner_html)

        # Keep only the attributes whose values survive JSON encoding.
        # Dropping the rest is deliberate best-effort behaviour: one odd
        # attribute should not prevent the element from being stored.
        safe_attrib = {}
        for name, value in element.attrib.items():
            try:
                json.dumps(value)
            except TypeError:
                continue
            safe_attrib[name] = value

        self.output_data.append("%s:attrib" % element_key, json.dumps(safe_attrib))

        if self.setting('pygments'):
            self.output_data.append("%s:html-source" % element_key, highlight(source, self.lexer, self.html_formatter))
            self.output_data.append("%s:latex-source" % element_key, highlight(source, self.lexer, self.latex_formatter))

    def process(self):
        """
        Parse the input file and store every element carrying a key attribute.

        The parser is picked from the input extension: XML for .xml and .txt,
        HTML for .html. An element is stored once per key it yields:

        * the value of each attribute listed in 'unique-attributes';
        * for each attribute listed in 'qualified-attributes', both the bare
          value and "<tag>:<value>".

        Raises Exception when the input extension is not one of the above.
        """
        assert self.output_data.state == 'ready'

        if self.setting('pygments'):
            # Built once here and reused for every element that gets stored.
            self.lexer = get_lexer_for_filename(self.input_data.storage.data_file())
            self.html_formatter = HtmlFormatter(lineanchors=self.output_data.web_safe_document_key())
            self.latex_formatter = LatexFormatter()

        if self.input_data.ext in ('.xml', '.txt'):
            parser = etree.XMLParser()
        elif self.input_data.ext == '.html':
            parser = etree.HTMLParser()
        else:
            raise Exception("Unsupported extension %s" % self.input_data.ext)

        tree = etree.parse(self.input_data.storage.data_file(), parser)

        # The settings do not change while iterating, so read them once
        # instead of once per element.
        unique_attributes = self.setting('unique-attributes')
        qualified_attributes = self.setting('qualified-attributes')

        for element in tree.iter("*"):
            attrib = element.attrib
            element_keys = []

            for attribute_name in unique_attributes:
                if attribute_name in attrib:
                    element_keys.append(attrib[attribute_name])
            for attribute_name in qualified_attributes:
                if attribute_name in attrib:
                    element_keys.append(attrib[attribute_name])
                    element_keys.append("%s:%s" % (element.tag, attrib[attribute_name]))

            # NOTE(review): an element whose unique and qualified attribute
            # values coincide yields the same key twice and is appended
            # twice; left unchanged to preserve existing output.
            for element_key in element_keys:
                self.append_element_attributes_with_key(element, element_key)

        self.output_data.save()

Content © 2013 Dr. Ana Nelson | Site Design © Copyright 2011 Andre Gagnon | All Rights Reserved.