"""
IncomingProdAnalyzer

Created by Emanuel Borges, 2020.02.07

A simple program that can be pointed at a DAT file to analyze it for issues
like columns not lining up, which fields were given in the header, which of
them actually contain data and which are totally empty, etc.

Support for UTF quote chars and delimiters, plus removal of the little BOM
at the beginning, has been added.
"""
| 14 |
|
|
|
| 15 |
nino.borges |
687 |
import chardet
|
| 16 |
|
|
|
| 17 |
nino.borges |
688 |
def AnalyzeDAT(datFilePath):
    """Analyze a Concordance-style DAT load file for structural issues.

    Parameters:
        datFilePath: path to the .DAT load file to inspect.

    Returns a 5-tuple:
        totalRecordCount     -- number of data rows (header excluded)
        fullFieldList        -- every field name found in the header
        populatedFieldsList  -- header fields with data in at least one row
        emptyFieldsList      -- header fields empty in every row
        parsingErrorCount    -- rows whose field count differs from the header
    """
    populatedColumns = {}   # column index -> 1 once any row has data there
    headderMatrix = {}      # column index -> header field name
    populatedFieldsList = []
    emptyFieldsList = []

    # Read everything up front, closing the handle promptly instead of
    # leaking it until garbage collection (the original never closed it).
    fileHandle = open(datFilePath)
    try:
        contents = fileHandle.readlines()
    finally:
        fileHandle.close()

    if not contents:
        # Empty file: no header, no records, nothing to analyze.
        return 0, [], [], [], 0

    # chardet may return None for lines it cannot classify; keep probing
    # line by line until it commits to an encoding.  The original indexed
    # with a bare counter and would IndexError if it ran off the end.
    charEncoding = None
    for probeLine in contents:
        charEncoding = chardet.detect(probeLine)['encoding']
        if charEncoding is not None:
            break
    if charEncoding is None:
        # Nothing detectable at all; fall through to the standard path.
        charEncoding = "ASCII"

    charEncoding = charEncoding.upper()
    if "UTF" in charEncoding:
        print("UTF found")
        quoteChar = "\xc3\xbe"  # UTF-8 encoded thorn quote char
        # Strip the UTF-8 byte order mark some exports prepend.
        headder = contents[0].replace("\xef\xbb\xbf", "")
    else:
        print("Standard load file found")
        quoteChar = "\xfe"      # single-byte thorn quote char
        headder = contents[0]
    delim = "\x14"              # Concordance field delimiter (DC4)

    # BUG FIX: strip the header line's trailing newline.  Data rows below
    # strip it, but the header did not, so the last header field name
    # carried a stray "\n".
    headder = headder.replace("\n", "")
    headder = headder.replace(quoteChar, "")
    headder = headder.split(delim)
    # Position -> name lookup, used at the end to report empty fields.
    for hSpot, hFieldName in enumerate(headder):
        headderMatrix[hSpot] = hFieldName
    numberOfFields = len(headder)
    contents = contents[1:]  # everything after the header is data

    totalRecordCount = len(contents)
    parsingErrorCount = 0
    for line in contents:
        line = line.replace("\n", "")
        line = line.replace(quoteChar, "")
        line = line.split(delim)
        if len(line) != numberOfFields:
            print("Warning: number of fields for this line doenst match.")
            parsingErrorCount = parsingErrorCount + 1
        for itemSpot, value in enumerate(line):
            if value:
                populatedColumns[itemSpot] = 1

    # Any column ever seen holding data is "populated".  Sorted for a
    # deterministic report order; guard against rows that were wider than
    # the header (which previously raised IndexError here).
    for spot in sorted(populatedColumns):
        if spot < numberOfFields:
            populatedFieldsList.append(headder[spot])

    # Every header column never seen with data is "totally empty".
    for hSpot in sorted(headderMatrix):
        if hSpot not in populatedColumns:
            emptyFieldsList.append(headderMatrix[hSpot])

    fullFieldList = headder

    return totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount
|
| 87 |
|
|
|
| 88 |
|
|
if __name__ == '__main__':

    # Hard-coded path to the DAT under review; point this at the file to
    # analyze before running.
    datFilePath = r"\\sas12\sas12\30393\Inbound\11\099878\All American Title Final Distribution Ledger REport\data\All American Title Final Distribution Ledger REport.DAT"

    print("Analyzing file...")
    # BUG FIX: the last unpacked name was misspelled "parsingErroCount",
    # so the error count could never be referenced (and was never shown).
    totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount = AnalyzeDAT(datFilePath)
    print("")
    print("There are %s records in this load." % totalRecordCount)
    # Previously computed but silently discarded; surface it in the report.
    print("There were %s lines with field-count parsing errors." % parsingErrorCount)

    print("\nAnalysis completed.")
    print("")
    print("-" * 10)
    print("The following fields exist in this DAT:")
    for fieldName in fullFieldList:
        print(fieldName)

    print("-" * 10)
    print("")
    print("The following fields actually contains *some* data:")
    for fieldName in populatedFieldsList:
        print(fieldName)

    print("-" * 10)
    print("")
    print("The following fields are totally empty:")
    for fieldName in emptyFieldsList:
        print(fieldName)
| 116 |
nino.borges |
695 |
|
| 117 |
|
|
|
| 118 |
|
|
|