Evidox/IncomingProdAnalyzer/IncomingProdAnalyzer.py

"""

IncomingProdAnalyzer

Created by
Emanuel Borges
2020.02.07

A simple program that I can point to a DAT and it will analyze it for issues like columns not lining up,
which fields were given in the header, which ones contain data and which are totally empty, etc.
Added support for UTF quote characters and delimiters, plus removal of the BOM at the beginning.

"""

import chardet

def AnalyzeDAT(datFilePath):
    """Analyze a DAT load file and report which header fields hold data.

    The first line of the file is treated as the header.  Every following
    line is counted as one record, split on the ``\\x14`` delimiter after the
    quote character has been removed, and compared against the header width.

    Args:
        datFilePath: Path of the DAT file to read.

    Returns:
        A 5-tuple ``(totalRecordCount, fullFieldList, populatedFieldsList,
        emptyFieldsList, parsingErrorCount)``:

        * totalRecordCount -- number of lines after the header.
        * fullFieldList -- header field names, in file order.
        * populatedFieldsList -- header fields that have a non-empty value
          in at least one record, in header order.
        * emptyFieldsList -- header fields that never have a value, in
          header order.
        * parsingErrorCount -- number of records whose column count differs
          from the header's.

        Field names are returned as the raw byte strings read from the file
        (plain ``str`` on Python 2).  A completely empty file yields
        ``(0, [], [], [], 0)``.
    """
    # Binary mode hands chardet the exact bytes on disk and avoids the
    # platform's text-mode newline/EOF translation; the ``with`` block makes
    # sure the handle is closed.
    with open(datFilePath, "rb") as datFile:
        contents = datFile.readlines()

    if not contents:
        # No header line at all: nothing to analyze.
        return 0, [], [], [], 0

    # Ask chardet about each line in turn and keep the first guess it is
    # willing to make (it returns None for lines it cannot classify).
    charEncoding = None
    for rawLine in contents:
        charEncoding = chardet.detect(rawLine)['encoding']
        if charEncoding is not None:
            break
    # If no line could be classified, fall through to the non-UTF branch
    # instead of running off the end of the file.
    charEncoding = (charEncoding or "").upper()

    # NOTE(review): any encoding name containing "UTF" (including UTF-16/32)
    # takes this branch, but the byte sequences below are UTF-8 specific.
    if "UTF" in charEncoding:
        print("UTF found")
        quoteChar = b"\xc3\xbe"
        # Drop the UTF-8 byte order mark so it does not end up in the first
        # field name.
        headder = contents[0].replace(b"\xef\xbb\xbf", b"")
    else:
        print("Standard load file found")
        quoteChar = b"\xfe"
        headder = contents[0]
    delim = b"\x14"

    # Strip the line ending from the header as well; otherwise the last field
    # name would carry a trailing newline.
    headder = headder.rstrip(b"\r\n").replace(quoteChar, b"").split(delim)
    numberOfFields = len(headder)

    records = contents[1:]
    totalRecordCount = len(records)
    parsingErrorCount = 0
    populatedSpots = set()  # column positions seen with a non-empty value
    for record in records:
        values = record.rstrip(b"\r\n").replace(quoteChar, b"").split(delim)
        if len(values) != numberOfFields:
            print("Warning: number of fields for this line doenst match.")
            parsingErrorCount += 1
        for itemSpot, value in enumerate(values):
            if value:
                populatedSpots.add(itemSpot)

    # Walk the header rather than the populated positions: this keeps header
    # order and ignores positions past the header width, which malformed
    # records with extra columns can produce and which have no field name.
    populatedFieldsList = [
        fieldName for spot, fieldName in enumerate(headder)
        if spot in populatedSpots
    ]
    emptyFieldsList = [
        fieldName for spot, fieldName in enumerate(headder)
        if spot not in populatedSpots
    ]

    fullFieldList = headder

    return totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount

if __name__ == '__main__':

    # Load file to inspect (hard-coded network location).
    datFilePath = r"\\sas12\sas12\30393\Inbound\11\099878\All American Title Final Distribution Ledger REport\data\All American Title Final Distribution Ledger REport.DAT"

    # Single-argument parenthesised prints produce the same output as the
    # print statement on Python 2.
    print("Analyzing file...")
    analysis = AnalyzeDAT(datFilePath)
    # The parsing error count is unpacked but not reported below.
    recordTotal, allFields, fieldsWithData, fieldsWithoutData, parseErrorTotal = analysis
    print("")
    print("There are %s records in this load." % recordTotal)

    print("\nAnalysis completed.")
    print("")

    divider = "-" * 10

    # First section: the full field list, with no blank line after the divider.
    print(divider)
    print("The following fields exist in this DAT:")
    for fieldName in allFields:
        print(fieldName)

    # Remaining sections share one layout: divider, blank line, title, names.
    remainingSections = (
        ("The following fields actually contains *some* data:", fieldsWithData),
        ("The following fields are totally empty:", fieldsWithoutData),
    )
    for sectionTitle, sectionFields in remainingSections:
        print(divider)
        print("")
        print(sectionTitle)
        for fieldName in sectionFields:
            print(fieldName)


Revision:	697
Committed:	Wed May 13 21:48:24 2020 UTC (5 years, 10 months ago) by nino.borges
Content type:	text/x-python
File size:	3505 byte(s)
Log Message:	Finished up the dat report dialog and added a bit more code to try to determine the encoding.
#	Content
1	"""
2
3	IncomingProdAnalyzer
4
5	Created by
6	Emanuel Borges
7	2020.02.07
8
9	A simple program that I can point to a DAT and it will analyze it for issues like columns not lining up,
10	what fields they gave in the headder and which ones have stuff and which are totally empty, etc.
11	Support for UTF quotechars and delims plus removing that little BOM at the beging added.
12
13	"""
14
15	import chardet
16
17	def AnalyzeDAT(datFilePath):
18	"""Returns totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount"""
19	matrix = {}
20	headderMatrix = {}
21	populatedFieldsList = []
22	emptyFieldsList = []
23
24
25	contents = open(datFilePath).readlines()
26
27	charEncoding = None
28	charEncodingCount = 0
29	while charEncoding == None:
30	charEncoding = chardet.detect(contents[charEncodingCount])['encoding']
31	charEncodingCount = charEncodingCount +1
32
33	charEncoding = charEncoding.upper()
34	#print charEncoding
35	if "UTF" in charEncoding:
36	print "UTF found"
37	quoteChar = "\xc3\xbe"
38	headder = contents[0].replace("\xef\xbb\xbf","")
39	else:
40	print "Standard load file found"
41	quoteChar = "\xfe"
42	headder = contents[0]
43	delim = "\x14"
44
45
46	headder = headder.replace(quoteChar,"")
47	headder = headder.split(delim)
48	## This headder Matrix is really to look up at the end. I dont use it for the main matrix below.
49	for hSpot, hFieldName in enumerate(headder):
50	headderMatrix[hSpot] = hFieldName
51	numberOfFields = len(headder)
52	contents = contents[1:]
53
54
55
56	totalRecordCount = len(contents)
57	parsingErrorCount = 0
58	for line in contents:
59	line = line.replace("\n","")
60	line = line.replace(quoteChar,"")
61	line = line.split(delim)
62	if len(line) == numberOfFields:
63	pass
64	else:
65	print "Warning: number of fields for this line doenst match."
66	parsingErrorCount = parsingErrorCount +1
67	for itemSpot, value in enumerate(line):
68	if value:
69	matrix[itemSpot] = 1
70
71
72	for spot in matrix.keys():
73	#print headder[spot]
74	populatedFieldsList.append(headder[spot])
75
76
77	for hSpot in headderMatrix.keys():
78	if hSpot in matrix.keys():
79	pass
80	else:
81	#print headderMatrix[hSpot]
82	emptyFieldsList.append(headderMatrix[hSpot])
83
84	fullFieldList = headder
85
86	return totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount
87
88	if __name__ == '__main__':
89
90	datFilePath = r"\\sas12\sas12\30393\Inbound\11\099878\All American Title Final Distribution Ledger REport\data\All American Title Final Distribution Ledger REport.DAT"
91
92	print "Analyzing file..."
93	totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErroCount = AnalyzeDAT(datFilePath)
94	print ""
95	print "There are %s records in this load."%totalRecordCount
96
97	print "\nAnalysis completed."
98	print ""
99	print "-"*10
100	print "The following fields exist in this DAT:"
101	for i in fullFieldList:
102	print i
103
104	print "-"*10
105	print ""
106	print "The following fields actually contains some data:"
107	for x in populatedFieldsList:
108	print x
109
110	print "-"*10
111	print ""
112	print "The following fields are totally empty:"
113	for y in emptyFieldsList:
114	print y
115
116
117
118