IncomingProdAnalyzer/Trunk/IncomingProdAnalyzer.py

"""

IncomingProdAnalyzer

Created by
Emanuel Borges
2020.02.07

A simple program that I can point to a DAT and it will analyze it for issues like columns not lining up,
what fields they gave in the headder and which ones have stuff and which are totally empty, etc.
Support for UTF quotechars and delims plus removing that little BOM at the beging added.

"""

import chardet, ExcelLib, BatesRangeFunctions, io

def AnalyzeDAT(datFilePath):
    """Returns totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim"""
    matrix = {}
    headderMatrix = {}
    populatedFieldsList = []
    emptyFieldsList = []

    ## Dummy contents just to figure out encoding
    contents = open(datFilePath).readlines()

    charEncoding = None
    charEncodingCount = 0
    while charEncoding == None:
        charEncoding = chardet.detect(contents[charEncodingCount])['encoding']
        charEncodingCount = charEncodingCount +1

    charEncoding = charEncoding.upper()


##    #print charEncoding
##    if "UTF" in charEncoding:
##        print "UTF found"
##        quoteChar = "\xc3\xbe"
##        headder = contents[0].replace("\xef\xbb\xbf","")
##    else:
##        print "Standard load file found"
##        quoteChar = "\xfe"
##        headder = contents[0]

    if "UTF" in charEncoding:
        print "UTF found"
        contents = io.open(datFilePath, encoding = charEncoding).readlines()
    else:
        print "Standard load file found"
        contents = io.open(datFilePath).readlines()
        
    print charEncoding
    delim = u"\x14"
    quoteChar = u"\xfe"
    headder = contents[0]
    #print contents[0]

    
    headder = headder.replace(quoteChar,"")
    headder = headder.replace("\n","")
    headder = headder.split(delim)
    ##  This headder Matrix is really to look up at the end.  I dont use it for the main matrix below.
    for hSpot, hFieldName in enumerate(headder):
        headderMatrix[hSpot] = hFieldName
    numberOfFields = len(headder)
    contents = contents[1:]


    totalRecordCount = len(contents)
    parsingErrorCount = 0
    for line in contents:
        line = line.replace("\n","")
        line = line.replace(quoteChar,"")
        line = line.split(delim)
        if len(line) == numberOfFields:
            pass
        else:
            print "Warning: number of fields for this line doenst match."
            parsingErrorCount = parsingErrorCount +1
        for itemSpot, value in enumerate(line):
            if value:
                matrix[itemSpot] = 1

    
    for spot in matrix.keys():
        #print headder[spot]
        populatedFieldsList.append(headder[spot])

    
    for hSpot in headderMatrix.keys():
        if hSpot in matrix.keys():
            pass
        else:
            #print headderMatrix[hSpot]
            emptyFieldsList.append(headderMatrix[hSpot])

    fullFieldList = headder

    return totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim 

def CreateExcelDatReport(spreadSheetPath, fullFieldList, emptyFieldsList, totalRecordCount):
    """Creates a spreadsheet as a pretty report of a field list, noting empty fields in Red highlight"""
    appObj = ExcelLib.ExcelConnection()
    count = 2
    for field in fullFieldList:
        appObj.setCell(1,count,1,field)
        if field in emptyFieldsList:
            appObj.setCellProperties(1,count,1,cellColor=3)
        count = count + 1
    appObj.forceAutoFitRow(1,"A","B")
    appObj.setCell(1,count + 2, 4, "Cells in Red are empty fields")
    appObj.setCell(1,count + 3, 4, "Total Documents: %s"% totalRecordCount)
    appObj.save(spreadSheetPath)
    appObj.close()

def CreateBatesRangeList(colsNumbList, contents, quoteChar, delim):
    batesList = []
    parsingErrorCount = 0
    for line in contents:
        line = line.replace("\n","")
        line = line.replace(quoteChar,"")
        line = line.split(delim)
        for colNumb in colsNumbList:
            batesList.append(line[colNumb])
    batesRangesList = BatesRangeFunctions.GetBatesRanges(batesList)
    return batesRangesList

    
if __name__ == '__main__':
    
    datFilePath = r"\\sas12\sas12\30393\Inbound\11\099878\All American Title Final Distribution Ledger REport\data\All American Title Final Distribution Ledger REport.DAT"
    
    print "Analyzing file..."
    totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErroCount, contents, quoteChar, delim = AnalyzeDAT(datFilePath)
    print ""
    print "There are %s records in this load."%totalRecordCount
    print fullFieldList
    
    print "\nAnalysis completed."
    print ""
    print "-"*10
    print "The following fields exist in this DAT:"
    for i in fullFieldList:
        print i

    print "-"*10
    print ""
    print "The following fields actually contains *some* data:"
    for x in populatedFieldsList:
        print x

    print "-"*10
    print ""
    print "The following fields are totally empty:"
    for y in emptyFieldsList:
        print y

    CreateExcelDatReport(r"c:\temp\foo-FieldReport.xlsx", fullFieldList, emptyFieldsList, totalRecordCount)


Revision:	718
Committed:	Thu Nov 5 22:08:59 2020 UTC (5 years, 4 months ago) by nino.borges
Content type:	text/x-python
File size:	5372 byte(s)
Log Message:	Adding more support for UTF by using io library for better file open control. Updated ver to 2.1
#	Content
1	"""
2
3	IncomingProdAnalyzer
4
5	Created by
6	Emanuel Borges
7	2020.02.07
8
9	A simple program that I can point to a DAT and it will analyze it for issues like columns not lining up,
10	what fields they gave in the headder and which ones have stuff and which are totally empty, etc.
11	Support for UTF quotechars and delims plus removing that little BOM at the beging added.
12
13	"""
14
15	import chardet, ExcelLib, BatesRangeFunctions, io
16
17	def AnalyzeDAT(datFilePath):
18	"""Returns totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim"""
19	matrix = {}
20	headderMatrix = {}
21	populatedFieldsList = []
22	emptyFieldsList = []
23
24	## Dummy contents just to figure out encoding
25	contents = open(datFilePath).readlines()
26
27	charEncoding = None
28	charEncodingCount = 0
29	while charEncoding == None:
30	charEncoding = chardet.detect(contents[charEncodingCount])['encoding']
31	charEncodingCount = charEncodingCount +1
32
33	charEncoding = charEncoding.upper()
34
35
36	## #print charEncoding
37	## if "UTF" in charEncoding:
38	## print "UTF found"
39	## quoteChar = "\xc3\xbe"
40	## headder = contents[0].replace("\xef\xbb\xbf","")
41	## else:
42	## print "Standard load file found"
43	## quoteChar = "\xfe"
44	## headder = contents[0]
45
46	if "UTF" in charEncoding:
47	print "UTF found"
48	contents = io.open(datFilePath, encoding = charEncoding).readlines()
49	else:
50	print "Standard load file found"
51	contents = io.open(datFilePath).readlines()
52
53	print charEncoding
54	delim = u"\x14"
55	quoteChar = u"\xfe"
56	headder = contents[0]
57	#print contents[0]
58
59
60	headder = headder.replace(quoteChar,"")
61	headder = headder.replace("\n","")
62	headder = headder.split(delim)
63	## This headder Matrix is really to look up at the end. I dont use it for the main matrix below.
64	for hSpot, hFieldName in enumerate(headder):
65	headderMatrix[hSpot] = hFieldName
66	numberOfFields = len(headder)
67	contents = contents[1:]
68
69
70
71	totalRecordCount = len(contents)
72	parsingErrorCount = 0
73	for line in contents:
74	line = line.replace("\n","")
75	line = line.replace(quoteChar,"")
76	line = line.split(delim)
77	if len(line) == numberOfFields:
78	pass
79	else:
80	print "Warning: number of fields for this line doenst match."
81	parsingErrorCount = parsingErrorCount +1
82	for itemSpot, value in enumerate(line):
83	if value:
84	matrix[itemSpot] = 1
85
86
87	for spot in matrix.keys():
88	#print headder[spot]
89	populatedFieldsList.append(headder[spot])
90
91
92	for hSpot in headderMatrix.keys():
93	if hSpot in matrix.keys():
94	pass
95	else:
96	#print headderMatrix[hSpot]
97	emptyFieldsList.append(headderMatrix[hSpot])
98
99	fullFieldList = headder
100
101	return totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim
102
103	def CreateExcelDatReport(spreadSheetPath, fullFieldList, emptyFieldsList, totalRecordCount):
104	"""Creates a spreadsheet as a pretty report of a field list, noting empty fields in Red highlight"""
105	appObj = ExcelLib.ExcelConnection()
106	count = 2
107	for field in fullFieldList:
108	appObj.setCell(1,count,1,field)
109	if field in emptyFieldsList:
110	appObj.setCellProperties(1,count,1,cellColor=3)
111	count = count + 1
112	appObj.forceAutoFitRow(1,"A","B")
113	appObj.setCell(1,count + 2, 4, "Cells in Red are empty fields")
114	appObj.setCell(1,count + 3, 4, "Total Documents: %s"% totalRecordCount)
115	appObj.save(spreadSheetPath)
116	appObj.close()
117
118	def CreateBatesRangeList(colsNumbList, contents, quoteChar, delim):
119	batesList = []
120	parsingErrorCount = 0
121	for line in contents:
122	line = line.replace("\n","")
123	line = line.replace(quoteChar,"")
124	line = line.split(delim)
125	for colNumb in colsNumbList:
126	batesList.append(line[colNumb])
127	batesRangesList = BatesRangeFunctions.GetBatesRanges(batesList)
128	return batesRangesList
129
130
131
132	if __name__ == '__main__':
133
134	datFilePath = r"\\sas12\sas12\30393\Inbound\11\099878\All American Title Final Distribution Ledger REport\data\All American Title Final Distribution Ledger REport.DAT"
135
136	print "Analyzing file..."
137	totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErroCount, contents, quoteChar, delim = AnalyzeDAT(datFilePath)
138	print ""
139	print "There are %s records in this load."%totalRecordCount
140	print fullFieldList
141
142	print "\nAnalysis completed."
143	print ""
144	print "-"*10
145	print "The following fields exist in this DAT:"
146	for i in fullFieldList:
147	print i
148
149	print "-"*10
150	print ""
151	print "The following fields actually contains some data:"
152	for x in populatedFieldsList:
153	print x
154
155	print "-"*10
156	print ""
157	print "The following fields are totally empty:"
158	for y in emptyFieldsList:
159	print y
160
161	CreateExcelDatReport(r"c:\temp\foo-FieldReport.xlsx", fullFieldList, emptyFieldsList, totalRecordCount)
162
163
164
165