Convert XML documents to dict and easily search for and retrieve the data they contain.


Keywords
XML, document, parse, search
License
Apache-2.0
Install
pip install xmlmanip==1.1.8.dev0

Documentation

from xmlmanip import XMLSchema, SearchableList
string = """
<breakfast_menu>
<food tag="waffles">
    <name>Belgian Waffles</name>
    <price>$5.95</price>
    <description>
   Two of our famous Belgian Waffles with plenty of real maple syrup
   </description>
    <calories>650</calories>
</food>
<food tag="waffles">
    <name >Strawberry Belgian Waffles</name>
    <price>$7.95</price>
    <description>
    Light Belgian waffles covered with strawberries and whipped cream
    </description>
    <calories>900</calories>
</food>
<food tag="waffles">
    <name>Berry-Berry Belgian Waffles</name>
    <price>$8.95</price>
    <description>
    Belgian waffles covered with assorted fresh berries and whipped cream
    </description>
    <calories>900</calories>
</food>
<food tag="toast">
    <name>French Toast</name>
    <price>$4.50</price>
    <description>
    Thick slices made from our homemade sourdough bread
    </description>
    <calories>600</calories>
</food>
<food tag="classic">
    <name>Homestyle Breakfast</name>
    <price>$6.95</price>
    <description>
    Two eggs, bacon or sausage, toast, and our ever-popular hash browns
    </description>
    <calories>950</calories>
</food>
</breakfast_menu>
"""

You can import your XML string to convert it to a dict. (dict conversion handled by https://github.com/martinblech/xmltodict).

schema = XMLSchema(string)
schema
XMLSchema([('breakfast_menu',
            OrderedDict([('food',
                          [OrderedDict([('@tag', 'waffles'),
                                        ('name', 'Belgian Waffles'),
                                        ('price', '$5.95'),
                                        ('description',
                                         'Two of our famous Belgian Waffles with plenty of real maple syrup'),
                                        ('calories', '650')]),
                           OrderedDict([('@tag', 'waffles'),
                                        ('name', 'Strawberry Belgian Waffles'),
                                        ('price', '$7.95'),
                                        ('description',
                                         'Light Belgian waffles covered with strawberries and whipped cream'),
                                        ('calories', '900')]),
                           OrderedDict([('@tag', 'waffles'),
                                        ('name',
                                         'Berry-Berry Belgian Waffles'),
                                        ('price', '$8.95'),
                                        ('description',
                                         'Belgian waffles covered with assorted fresh berries and whipped cream'),
                                        ('calories', '900')]),
                           OrderedDict([('@tag', 'toast'),
                                        ('name', 'French Toast'),
                                        ('price', '$4.50'),
                                        ('description',
                                         'Thick slices made from our homemade sourdough bread'),
                                        ('calories', '600')]),
                           OrderedDict([('@tag', 'classic'),
                                        ('name', 'Homestyle Breakfast'),
                                        ('price', '$6.95'),
                                        ('description',
                                         'Two eggs, bacon or sausage, toast, and our ever-popular hash browns'),
                                        ('calories', '950')])])]))])

Use .search() to search for data of interest.

schema.search(name="Homestyle Breakfast")
[SchemaInnerDict([('@tag', 'classic'),
                  ('name', 'Homestyle Breakfast'),
                  ('price', '$6.95'),
                  ('description',
                   'Two eggs, bacon or sausage, toast, and our ever-popular hash browns'),
                  ('calories', '950')])]

The SearchAbleList class will also allow you to easily search through lists of dicts.

example_list = [{"thing": 1, "other_thing": 2}, {"thing": 2, "other_thing": 2}]
searchable_list = SearchableList(example_list)
print(searchable_list.search(thing__ne=2)) # thing != 2
print(searchable_list.search(other_thing=2))
[{'thing': 1, 'other_thing': 2}]
[{'thing': 1, 'other_thing': 2}, {'thing': 2, 'other_thing': 2}]

Use .locate() if you are interested in the "path" to your data of interest and .retrieve() to get an object from its "path."

schema.locate(name="Homestyle Breakfast")
['__breakfast_menu__food__4__name']
schema.retrieve('__breakfast_menu__food__4__name')
'Homestyle Breakfast'
schema.retrieve('__breakfast_menu__food__4')
SchemaInnerDict([('@tag', 'classic'),
                 ('name', 'Homestyle Breakfast'),
                 ('price', '$6.95'),
                 ('description',
                  'Two eggs, bacon or sausage, toast, and our ever-popular hash browns'),
                 ('calories', '950')])

You have access to all of the standard comparison methods.

paths = schema.locate(name__contains="Waffles")
paths
['__breakfast_menu__food__0__name',
 '__breakfast_menu__food__1__name',
 '__breakfast_menu__food__2__name']
schema.search(name__contains="Waffles")
[SchemaInnerDict([('@tag', 'waffles'),
                  ('name', 'Belgian Waffles'),
                  ('price', '$5.95'),
                  ('description',
                   'Two of our famous Belgian Waffles with plenty of real maple syrup'),
                  ('calories', '650')]),
 SchemaInnerDict([('@tag', 'waffles'),
                  ('name', 'Berry-Berry Belgian Waffles'),
                  ('price', '$8.95'),
                  ('description',
                   'Belgian waffles covered with assorted fresh berries and whipped cream'),
                  ('calories', '900')]),
 SchemaInnerDict([('@tag', 'waffles'),
                  ('name', 'Strawberry Belgian Waffles'),
                  ('price', '$7.95'),
                  ('description',
                   'Light Belgian waffles covered with strawberries and whipped cream'),
                  ('calories', '900')])]
schema.search(calories__lt="700")
[SchemaInnerDict([('@tag', 'toast'),
                  ('name', 'French Toast'),
                  ('price', '$4.50'),
                  ('description',
                   'Thick slices made from our homemade sourdough bread'),
                  ('calories', '600')]),
 SchemaInnerDict([('@tag', 'waffles'),
                  ('name', 'Belgian Waffles'),
                  ('price', '$5.95'),
                  ('description',
                   'Two of our famous Belgian Waffles with plenty of real maple syrup'),
                  ('calories', '650')])]
Warning, all types are compared as strings, which may have undesirable results.
schema.search(calories__lt="700") == schema.search(calories__lt="70") 
True

Some attributes cannot be accessed via keyword arguements, unfortunately.

schema.search(@tag__ne="waffles")
  File "<ipython-input-13-da95e3095c41>", line 1
    schema.search(@tag__ne="waffles")
                  ^
SyntaxError: invalid syntax

You will need to pass the desired attribute and comparison method as strings in this case.

schema.search('@tag', 'waffles') # default comparison is __eq__
[SchemaInnerDict([('@tag', 'waffles'),
                  ('name', 'Belgian Waffles'),
                  ('price', '$5.95'),
                  ('description',
                   'Two of our famous Belgian Waffles with plenty of real maple syrup'),
                  ('calories', '650')]),
 SchemaInnerDict([('@tag', 'waffles'),
                  ('name', 'Strawberry Belgian Waffles'),
                  ('price', '$7.95'),
                  ('description',
                   'Light Belgian waffles covered with strawberries and whipped cream'),
                  ('calories', '900')]),
 SchemaInnerDict([('@tag', 'waffles'),
                  ('name', 'Berry-Berry Belgian Waffles'),
                  ('price', '$8.95'),
                  ('description',
                   'Belgian waffles covered with assorted fresh berries and whipped cream'),
                  ('calories', '900')])]
schema.search('@tag', 'waffles', comparison='ne')
[SchemaInnerDict([('@tag', 'classic'),
                  ('name', 'Homestyle Breakfast'),
                  ('price', '$6.95'),
                  ('description',
                   'Two eggs, bacon or sausage, toast, and our ever-popular hash browns'),
                  ('calories', '950')]),
 SchemaInnerDict([('@tag', 'toast'),
                  ('name', 'French Toast'),
                  ('price', '$4.50'),
                  ('description',
                   'Thick slices made from our homemade sourdough bread'),
                  ('calories', '600')])]