| 1 |
nino.borges |
959 |
"""
|
| 2 |
|
|
|
| 3 |
|
|
XmlSchemaInspector
|
| 4 |
|
|
|
| 5 |
|
|
Created by:
|
| 6 |
|
|
Emanuel Borges
|
| 7 |
|
|
11.01.2024
|
| 8 |
|
|
|
| 9 |
|
|
A library that I've been refining to inspect XML data from Relativity, specifically from audit history reports.
|
| 10 |
|
|
This will give me some insight so that I can write a true report generator from these exports.
|
| 11 |
|
|
|
| 12 |
|
|
"""
|
| 13 |
|
|
|
| 14 |
|
|
import csv
|
| 15 |
|
|
import xml.etree.ElementTree as ET
|
| 16 |
|
|
from collections import Counter, defaultdict
|
| 17 |
|
|
from typing import Optional
|
| 18 |
|
|
|
| 19 |
|
|
|
| 20 |
|
|
class XmlSchemaInspector:
    """Accumulate tag and attribute frequencies across many XML fragments.

    Feed fragments one at a time with :meth:`add_xml_str`, then call
    :meth:`print_summary` to print how often each tag occurred and which
    attributes were seen on it.
    """

    def __init__(self) -> None:
        # tag -> number of elements seen with that tag
        self.tag_counts = Counter()
        # tag -> attr_name -> number of elements of that tag carrying it
        self.attr_counts = defaultdict(Counter)
        # Number of non-empty fragments skipped because they failed to parse.
        self.parse_error_count = 0

    def add_xml_str(self, s: str) -> None:
        """Parse one XML fragment and fold its tags/attributes into the tallies.

        ``None``, empty and whitespace-only input is ignored. Input that is
        not well-formed XML is skipped (best effort) and counted in
        ``parse_error_count`` so the caller can tell how much was dropped.
        """
        s = (s or "").strip()
        if not s:
            return
        try:
            root = ET.fromstring(s)
        except ET.ParseError:
            # Not valid XML: keep going, but record that a fragment was lost.
            self.parse_error_count += 1
            return
        self._visit(root)

    def _visit(self, elem: ET.Element) -> None:
        """Count ``elem`` and every descendant, in document order.

        Uses an explicit stack rather than recursion so that deeply nested
        documents cannot exhaust the interpreter's recursion limit.
        """
        pending = [elem]
        while pending:
            node = pending.pop()
            self.tag_counts[node.tag] += 1
            for attr_name in node.attrib:
                self.attr_counts[node.tag][attr_name] += 1
            # Push children reversed so they pop in document order; this keeps
            # first-seen ordering (and most_common() tie-breaking) stable.
            pending.extend(reversed(list(node)))

    def print_summary(self) -> None:
        """Print each tag with its count, followed by its attribute counts."""
        print("Tag / attribute summary:")
        for tag, count in self.tag_counts.most_common():
            print(f" <{tag}>: {count} occurrences")
            # .get() instead of [] so that reporting never inserts empty
            # Counters into the defaultdict for attribute-less tags.
            attrs = self.attr_counts.get(tag)
            if attrs:
                print(" attributes:")
                for attr_name, acount in attrs.most_common():
                    print(f" {attr_name}: present in {acount} elements")
|
| 51 |
|
|
|
| 52 |
|
|
|
| 53 |
|
|
def inspect_details_xml_from_csv(csv_path: str, limit: Optional[int] = None) -> None:
    """Summarise the XML found in the ``Details`` column of a CSV export.

    Each row's ``Details`` value is handed to an ``XmlSchemaInspector``;
    afterwards the number of rows read and the tag/attribute summary are
    printed.

    Args:
        csv_path: Path to the CSV file. It is opened as ``utf-8-sig`` so a
            leading byte-order mark does not end up in the first header name.
        limit: Maximum number of rows to process, or ``None`` for all rows.
            A limit of zero or less processes no rows.
    """
    inspector = XmlSchemaInspector()
    count = 0

    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Test the cap before handling the row; checking afterwards let
            # limit=0 still process one row.
            if limit is not None and count >= limit:
                break
            # A missing or empty Details cell is passed as "" and ignored
            # by the inspector, but the row still counts as processed.
            inspector.add_xml_str(row.get("Details") or "")
            count += 1

    print(f"Processed {count} CSV rows (for XML).")
    inspector.print_summary()
|
| 68 |
|
|
|
| 69 |
|
|
|
| 70 |
|
|
if __name__ == "__main__":
    # Script entry point: sample the first 500 rows of an audit export.
    # The path below is a placeholder; point it at a real CSV before running.
    csv_path = r"C:\path\to\RelativityAuditReport.csv"
    inspect_details_xml_from_csv(csv_path=csv_path, limit=500)
|
| 73 |
|
|
|