Active_prgs/Evidox/IncomingProdAnalyzer.py

"""

IncomingProdAnalyzer

Created by
Emanuel Borges
2020.02.07

A simple program that I can point at a DAT file; it analyzes the file for issues such as columns not lining up,
which fields were given in the header, which of them contain data and which are totally empty, etc.
Support for UTF quote characters and delimiters, plus removal of the BOM at the beginning of the file, has been added.

"""

import chardet

def AnalyzeDAT(datFilePath):
    """Analyze a DAT load file and print a report about its fields.

    The first line of the file is treated as the header.  Fields are
    separated by the byte 0x14 and wrapped in a thorn quote character
    (0xFE, or its UTF-8 form 0xC3 0xBE when the header is detected as UTF).
    Every following line is one record.

    The report printed to stdout contains:
      * a warning for every record whose field count differs from the header,
      * every field named in the header,
      * the fields that hold data in at least one record,
      * the fields that are empty in every record.

    The file is processed as raw bytes, so field names are byte strings
    (plain ``str`` on Python 2).

    Args:
        datFilePath: path of the DAT file to analyze.

    Returns:
        A dict summarizing what was printed, with the keys ``'fields'``
        (header names in file order), ``'populated'`` and ``'empty'``
        (subsets of the header names, in header order), ``'recordCount'``
        and ``'parsingErrorCount'`` (records whose field count does not
        match the header).

    Raises:
        IOError / OSError: if the file cannot be opened or read.
    """
    utf8Bom = b"\xef\xbb\xbf"
    delim = b"\x14"
    summary = {
        'fields': [],
        'populated': [],
        'empty': [],
        'recordCount': 0,
        'parsingErrorCount': 0,
    }

    # Binary mode: the parsing below works on raw bytes, and text mode would
    # translate line endings (and, on Python 3, hand chardet a str it rejects).
    # The with-block also guarantees the handle is closed.
    with open(datFilePath, "rb") as datFile:
        contents = datFile.readlines()

    if not contents:
        # No header line at all; indexing contents[0] would raise IndexError.
        print("Warning: the DAT file is empty; nothing to analyze.")
        return summary

    # Strip the line terminator so it does not end up inside the last field name.
    headder = contents[0].rstrip(b"\r\n")

    if headder.startswith(utf8Bom):
        # A UTF-8 byte order mark is conclusive; no need to guess.
        isUtf = True
    else:
        # chardet reports None as the encoding when it cannot decide.
        charEncoding = (chardet.detect(contents[0])['encoding'] or "").upper()
        isUtf = "UTF" in charEncoding

    if isUtf:
        # Only the UTF-8 byte sequences of the thorn and the BOM are handled here.
        quoteChar = b"\xc3\xbe"
        headder = headder.replace(utf8Bom, b"")
    else:
        quoteChar = b"\xfe"

    headder = headder.replace(quoteChar, b"").split(delim)
    numberOfFields = len(headder)
    contents = contents[1:]

    print("Analyzing file...")
    print("There are %s records in this load." % len(contents))
    parsingErrorCount = 0
    populatedSpots = set()
    for line in contents:
        # Drop the terminator (including the "\r" of CRLF files) so it is not
        # mistaken for data in the last column.
        line = line.rstrip(b"\r\n").replace(quoteChar, b"").split(delim)
        if len(line) != numberOfFields:
            print("Warning: number of fields for this line doenst match.")
            parsingErrorCount += 1
        # Columns beyond the header have no name to report against; they are
        # already covered by the warning above, so only header columns count.
        for itemSpot, value in enumerate(line[:numberOfFields]):
            if value:
                populatedSpots.add(itemSpot)

    # Build both lists in header order so the report is deterministic.
    populatedFields = [headder[spot] for spot in sorted(populatedSpots)]
    emptyFields = [fieldName for spot, fieldName in enumerate(headder)
                   if spot not in populatedSpots]

    print("Analysis completed.")
    print("")
    print("-"*10)
    print("The following fields exist in this DAT:")
    for fieldName in headder:
        print(fieldName)
    print("-"*10)
    print("")
    print("The following fields actually contains *some* data:")
    for fieldName in populatedFields:
        print(fieldName)

    print("-"*10)
    print("")
    print("The following fields are totally empty:")
    for fieldName in emptyFields:
        print(fieldName)

    summary['fields'] = headder
    summary['populated'] = populatedFields
    summary['empty'] = emptyFields
    summary['recordCount'] = len(contents)
    summary['parsingErrorCount'] = parsingErrorCount
    return summary

if __name__ == '__main__':
    # Local import: sys is only needed when the module is run as a script.
    import sys

    # With no arguments the original hard-coded DAT is analyzed; a path given
    # as the first command-line argument overrides it, so other loads can be
    # checked without editing this file.
    datFilePath = r"\\sas44\sas44\34039\Deliverable\21\NAT006\DATA\NAT006.DAT"
    if len(sys.argv) > 1:
        datFilePath = sys.argv[1]
    AnalyzeDAT(datFilePath)
    
    
Revision:	692
Committed:	Wed May 13 15:35:21 2020 UTC (5 years, 10 months ago) by nino.borges
Content type:	text/x-python
File size:	2616 byte(s)
Log Message:	Moving into it's own folder
#	Content
1	"""
2
3	IncomingProdAnalyzer
4
5	Created by
6	Emanuel Borges
7	2020.02.07
8
9	A simple program that I can point at a DAT file; it analyzes the file for issues such as columns not lining up,
10	which fields were given in the header, which of them contain data and which are totally empty, etc.
11	Support for UTF quote characters and delimiters, plus removal of the BOM at the beginning of the file, has been added.
12
13	"""
14
15	import chardet
16
17	def AnalyzeDAT(datFilePath):
18	matrix = {}
19	headderMatrix = {}
20	populatedFields = []
21	emptyFields = []
22
23
24	contents = open(datFilePath).readlines()
25
26	charEncoding = chardet.detect(contents[0])['encoding'].upper()
27	#print charEncoding
28	if "UTF" in charEncoding:
29	#print "UTF found"
30	quoteChar = "\xc3\xbe"
31	headder = contents[0].replace("\xef\xbb\xbf","")
32	else:
33	#print "Standard load file found"
34	quoteChar = "\xfe"
35	headder = contents[0]
36	delim = "\x14"
37
38
39	headder = headder.replace(quoteChar,"")
40	headder = headder.split(delim)
41	## This headder Matrix is really to look up at the end. I dont use it for the main matrix below.
42	for hSpot, hFieldName in enumerate(headder):
43	headderMatrix[hSpot] = hFieldName
44	numberOfFields = len(headder)
45	contents = contents[1:]
46
47
48	print "Analyzing file..."
49	print "There are %s records in this load."%len(contents)
50	totalRecordCount = len(contents)
51	parsingErrorCount = 0
52	for line in contents:
53	line = line.replace("\n","")
54	line = line.replace(quoteChar,"")
55	line = line.split(delim)
56	if len(line) == numberOfFields:
57	pass
58	else:
59	print "Warning: number of fields for this line doenst match."
60	parsingErrorCount = parsingErrorCount +1
61	for itemSpot, value in enumerate(line):
62	if value:
63	matrix[itemSpot] = 1
64
65	print "Analysis completed."
66	print ""
67	print "-"*10
68	print "The following fields exist in this DAT:"
69	for i in headder:
70	print i
71	print "-"*10
72	print ""
73	print "The following fields actually contains some data:"
74	for spot in matrix.keys():
75	print headder[spot]
76
77	print "-"*10
78	print ""
79	print "The following fields are totally empty:"
80	for hSpot in headderMatrix.keys():
81	if hSpot in matrix.keys():
82	pass
83	else:
84	print headderMatrix[hSpot]
85
86	if __name__ == '__main__':
87
88	datFilePath = r"\\sas44\sas44\34039\Deliverable\21\NAT006\DATA\NAT006.DAT"
89	AnalyzeDAT(datFilePath)
90
91