IncomingProdAnalyzer/Trunk/IncomingProdAnalyzer.py

"""

IncomingProdAnalyzer

Created by
Emanuel Borges
2020.02.07

A simple program that I can point to a DAT and it will analyze it for issues like columns not lining up,
what fields they gave in the headder and which ones have stuff and which are totally empty, etc.
Support for UTF quotechars and delims plus removing that little BOM at the beging added.

"""

import chardet, ExcelLib, BatesRangeFunctions

def AnalyzeDAT(datFilePath):
    """Returns totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim"""
    matrix = {}
    headderMatrix = {}
    populatedFieldsList = []
    emptyFieldsList = []


    contents = open(datFilePath).readlines()

    charEncoding = None
    charEncodingCount = 0
    while charEncoding == None:
        charEncoding = chardet.detect(contents[charEncodingCount])['encoding']
        charEncodingCount = charEncodingCount +1

    charEncoding = charEncoding.upper()
    #print charEncoding
    if "UTF" in charEncoding:
        print "UTF found"
        quoteChar = "\xc3\xbe"
        headder = contents[0].replace("\xef\xbb\xbf","")
    else:
        print "Standard load file found"
        quoteChar = "\xfe"
        headder = contents[0]
    delim = "\x14"

    
    headder = headder.replace(quoteChar,"")
    headder = headder.replace("\n","")
    headder = headder.split(delim)
    ##  This headder Matrix is really to look up at the end.  I dont use it for the main matrix below.
    for hSpot, hFieldName in enumerate(headder):
        headderMatrix[hSpot] = hFieldName
    numberOfFields = len(headder)
    contents = contents[1:]


    totalRecordCount = len(contents)
    parsingErrorCount = 0
    for line in contents:
        line = line.replace("\n","")
        line = line.replace(quoteChar,"")
        line = line.split(delim)
        if len(line) == numberOfFields:
            pass
        else:
            print "Warning: number of fields for this line doenst match."
            parsingErrorCount = parsingErrorCount +1
        for itemSpot, value in enumerate(line):
            if value:
                matrix[itemSpot] = 1

    
    for spot in matrix.keys():
        #print headder[spot]
        populatedFieldsList.append(headder[spot])

    
    for hSpot in headderMatrix.keys():
        if hSpot in matrix.keys():
            pass
        else:
            #print headderMatrix[hSpot]
            emptyFieldsList.append(headderMatrix[hSpot])

    fullFieldList = headder

    return totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim 

def CreateExcelDatReport(spreadSheetPath, fullFieldList, emptyFieldsList, totalRecordCount):
    """Creates a spreadsheet as a pretty report of a field list, noting empty fields in Red highlight"""
    appObj = ExcelLib.ExcelConnection()
    count = 2
    for field in fullFieldList:
        appObj.setCell(1,count,1,field)
        if field in emptyFieldsList:
            appObj.setCellProperties(1,count,1,cellColor=3)
        count = count + 1
    appObj.forceAutoFitRow(1,"A","B")
    appObj.setCell(1,count + 2, 4, "Cells in Red are empty fields")
    appObj.setCell(1,count + 3, 4, "Total Documents: %s"% totalRecordCount)
    appObj.save(spreadSheetPath)
    appObj.close()

def CreateBatesRangeList(colsNumbList, contents, quoteChar, delim):
    batesList = []
    parsingErrorCount = 0
    for line in contents:
        line = line.replace("\n","")
        line = line.replace(quoteChar,"")
        line = line.split(delim)
        for colNumb in colsNumbList:
            batesList.append(line[colNumb])
    batesRangesList = BatesRangeFunctions.GetBatesRanges(batesList)
    return batesRangesList

    
if __name__ == '__main__':
    
    datFilePath = r"\\sas12\sas12\30393\Inbound\11\099878\All American Title Final Distribution Ledger REport\data\All American Title Final Distribution Ledger REport.DAT"
    
    print "Analyzing file..."
    totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErroCount, contents, quoteChar, delim = AnalyzeDAT(datFilePath)
    print ""
    print "There are %s records in this load."%totalRecordCount
    print fullFieldList
    
    print "\nAnalysis completed."
    print ""
    print "-"*10
    print "The following fields exist in this DAT:"
    for i in fullFieldList:
        print i

    print "-"*10
    print ""
    print "The following fields actually contains *some* data:"
    for x in populatedFieldsList:
        print x

    print "-"*10
    print ""
    print "The following fields are totally empty:"
    for y in emptyFieldsList:
        print y

    CreateExcelDatReport(r"c:\temp\foo-FieldReport.xlsx", fullFieldList, emptyFieldsList, totalRecordCount)


Revision:	710
Committed:	Tue Aug 18 22:05:33 2020 UTC (5 years, 7 months ago) by nino.borges
Content type:	text/x-python
File size:	4942 byte(s)
Log Message:	Version 2 which is a major re-write allowing for more tools and functions to be usable on scanned dat. Create excel report, create bates range report.
#	Content
1	"""
2
3	IncomingProdAnalyzer
4
5	Created by
6	Emanuel Borges
7	2020.02.07
8
9	A simple program that I can point to a DAT and it will analyze it for issues like columns not lining up,
10	what fields they gave in the headder and which ones have stuff and which are totally empty, etc.
11	Support for UTF quotechars and delims plus removing that little BOM at the beging added.
12
13	"""
14
15	import chardet, ExcelLib, BatesRangeFunctions
16
17	def AnalyzeDAT(datFilePath):
18	"""Returns totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim"""
19	matrix = {}
20	headderMatrix = {}
21	populatedFieldsList = []
22	emptyFieldsList = []
23
24
25	contents = open(datFilePath).readlines()
26
27	charEncoding = None
28	charEncodingCount = 0
29	while charEncoding == None:
30	charEncoding = chardet.detect(contents[charEncodingCount])['encoding']
31	charEncodingCount = charEncodingCount +1
32
33	charEncoding = charEncoding.upper()
34	#print charEncoding
35	if "UTF" in charEncoding:
36	print "UTF found"
37	quoteChar = "\xc3\xbe"
38	headder = contents[0].replace("\xef\xbb\xbf","")
39	else:
40	print "Standard load file found"
41	quoteChar = "\xfe"
42	headder = contents[0]
43	delim = "\x14"
44
45
46	headder = headder.replace(quoteChar,"")
47	headder = headder.replace("\n","")
48	headder = headder.split(delim)
49	## This headder Matrix is really to look up at the end. I dont use it for the main matrix below.
50	for hSpot, hFieldName in enumerate(headder):
51	headderMatrix[hSpot] = hFieldName
52	numberOfFields = len(headder)
53	contents = contents[1:]
54
55
56
57	totalRecordCount = len(contents)
58	parsingErrorCount = 0
59	for line in contents:
60	line = line.replace("\n","")
61	line = line.replace(quoteChar,"")
62	line = line.split(delim)
63	if len(line) == numberOfFields:
64	pass
65	else:
66	print "Warning: number of fields for this line doenst match."
67	parsingErrorCount = parsingErrorCount +1
68	for itemSpot, value in enumerate(line):
69	if value:
70	matrix[itemSpot] = 1
71
72
73	for spot in matrix.keys():
74	#print headder[spot]
75	populatedFieldsList.append(headder[spot])
76
77
78	for hSpot in headderMatrix.keys():
79	if hSpot in matrix.keys():
80	pass
81	else:
82	#print headderMatrix[hSpot]
83	emptyFieldsList.append(headderMatrix[hSpot])
84
85	fullFieldList = headder
86
87	return totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErrorCount, contents, quoteChar, delim
88
89	def CreateExcelDatReport(spreadSheetPath, fullFieldList, emptyFieldsList, totalRecordCount):
90	"""Creates a spreadsheet as a pretty report of a field list, noting empty fields in Red highlight"""
91	appObj = ExcelLib.ExcelConnection()
92	count = 2
93	for field in fullFieldList:
94	appObj.setCell(1,count,1,field)
95	if field in emptyFieldsList:
96	appObj.setCellProperties(1,count,1,cellColor=3)
97	count = count + 1
98	appObj.forceAutoFitRow(1,"A","B")
99	appObj.setCell(1,count + 2, 4, "Cells in Red are empty fields")
100	appObj.setCell(1,count + 3, 4, "Total Documents: %s"% totalRecordCount)
101	appObj.save(spreadSheetPath)
102	appObj.close()
103
104	def CreateBatesRangeList(colsNumbList, contents, quoteChar, delim):
105	batesList = []
106	parsingErrorCount = 0
107	for line in contents:
108	line = line.replace("\n","")
109	line = line.replace(quoteChar,"")
110	line = line.split(delim)
111	for colNumb in colsNumbList:
112	batesList.append(line[colNumb])
113	batesRangesList = BatesRangeFunctions.GetBatesRanges(batesList)
114	return batesRangesList
115
116
117
118	if __name__ == '__main__':
119
120	datFilePath = r"\\sas12\sas12\30393\Inbound\11\099878\All American Title Final Distribution Ledger REport\data\All American Title Final Distribution Ledger REport.DAT"
121
122	print "Analyzing file..."
123	totalRecordCount, fullFieldList, populatedFieldsList, emptyFieldsList, parsingErroCount, contents, quoteChar, delim = AnalyzeDAT(datFilePath)
124	print ""
125	print "There are %s records in this load."%totalRecordCount
126	print fullFieldList
127
128	print "\nAnalysis completed."
129	print ""
130	print "-"*10
131	print "The following fields exist in this DAT:"
132	for i in fullFieldList:
133	print i
134
135	print "-"*10
136	print ""
137	print "The following fields actually contains some data:"
138	for x in populatedFieldsList:
139	print x
140
141	print "-"*10
142	print ""
143	print "The following fields are totally empty:"
144	for y in emptyFieldsList:
145	print y
146
147	CreateExcelDatReport(r"c:\temp\foo-FieldReport.xlsx", fullFieldList, emptyFieldsList, totalRecordCount)
148
149
150
151