Active_prgs/Evidox/addErrorFileFixer.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""

addErrorFileFixer.py

Created by
Emanuel Borges
08.31.2017

This program takes a directory of DAT files, merges their records (keyed by EVDXID) across the union of all header fields found in those files, and writes the result out as 1 large DAT file.

"""

import codecs, os
from collections import namedtuple

def ConcordanceToList(line, delimChar, quoteChar):
    """Split one line of Concordance delimited text into a list of field values.

    Carriage returns, newlines, the BOM character (U+FEFF) and every
    occurrence of quoteChar are stripped out first; what is left is split
    on delimChar and returned as a list of strings.
    """
    ## Strip the characters in the same order every time, quote char last.
    for unwanted in ("\r", "\n", "\ufeff", quoteChar):
        line = line.replace(unwanted, "")
    return line.split(delimChar)

def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
    """Collect the unique header field names from every file in a directory.

    Only the first line (the header row) of each file is read.  Spaces in
    the header row are replaced with underscores before it is split into
    field names with ConcordanceToList.

    Args:
        pathToFiles: directory whose entries are each opened as a UTF-8 file.
        delimChar: field delimiter passed through to ConcordanceToList.
        quoteChar: quote character passed through to ConcordanceToList.

    Returns:
        A dict whose keys are the distinct field names seen across all
        header rows; every value is 1 (the dict is used as a set).
    """
    headderDict = {}
    for f in os.listdir(pathToFiles):
        ## Only the header row is needed, so read a single line, and let the
        ## context manager close the handle even if decoding fails.
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as datFile:
            headderRow = datFile.readline()
        if not headderRow:
            ## Zero-byte file: it has no header row, so it adds no fields.
            continue
        headderRow = headderRow.replace(" ", "_")
        for h in ConcordanceToList(headderRow, delimChar, quoteChar):
            headderDict[h] = 1
    return headderDict

def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj, fullHeadderList):
    """Read every record from every file in a directory into dataStoreObj rows.

    For each file the first line is the header row (spaces replaced with
    underscores) and its first column is treated as the record ID.  Each
    following line becomes one dataStoreObj instance built from keyword
    arguments: the file's own columns carry the line's values and every
    field in fullHeadderList that the file lacks is set to "".

    Args:
        pathToFiles: directory whose entries are each opened as a UTF-8 file.
        delimChar: field delimiter passed through to ConcordanceToList.
        quoteChar: quote character passed through to ConcordanceToList.
        dataStoreObj: callable invoked as dataStoreObj(**fields) per record.
        fullHeadderList: every field name (without the ID column) that a
            record must supply.

    Returns:
        A tuple (dataStoreObj, recordDict) where recordDict maps the record
        ID to the object created for it.  A record ID that shows up more
        than once keeps only the last record read for it.

    Raises:
        IndexError: if a non-blank data line has fewer values than the
            file's header row has columns.
    """
    recordDict = {}
    ## NOTE(review): os.listdir order is not guaranteed, so which file wins
    ## for a duplicated record ID depends on the file system -- confirm that
    ## is acceptable.
    for f in os.listdir(pathToFiles):
        ## Close the handle as soon as the lines are in memory.
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as datFile:
            contents = datFile.readlines()
        if not contents:
            ## Zero-byte file: no header row and no records.
            continue
        headderRow = contents[0].replace(" ", "_")
        ## Drop the first column (the record ID) from the header row
        headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)[1:]
        ## Fields this file does not have; the datastore obj needs a value
        ## for every field, so these get filled in as empty strings.
        remainderHeadderRow = set(fullHeadderList) - set(headderRow)
        for line in contents[1:]:
            line = ConcordanceToList(line, delimChar, quoteChar)
            if line == [""]:
                ## Blank line (e.g. a trailing newline at the end of the file)
                continue
            eID = line[0]
            ## Now remove it so that the header is back in sync with the line.
            line = line[1:]
            ## Start with every missing field set to "".  This is done once
            ## per record, outside the column loop, so a file with only the
            ## ID column still gets all of its fields.
            tempDict = dict.fromkeys(remainderHeadderRow, "")
            for i, field in enumerate(headderRow):
                tempDict[field] = line[i]
            ## Now create the obj entry, using the ** unpack method
            recordDict[eID] = dataStoreObj(**tempDict)
    return dataStoreObj, recordDict

if __name__ == '__main__':
    ## Merge every DAT file in mainDir into one delimited output file whose
    ## columns are EVDXID followed by all other field names in sorted order.
    delimChar = "\x14"
    quoteChar = "\xfe"
    mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
    outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
    outputFileName = '20171122-Output.txt'
    outputHeaderFileName = '20171122-HeaddersOutput.txt'

    ##  Gather all the field names found across the input files.
    headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
    headderList = sorted(headderDict)

    ##  Quickly make a text file with the headers, just to be nice.
    ##  The with block closes the file even if a write fails.
    with codecs.open(os.path.join(outputPath, outputHeaderFileName), 'w', "UTF-8") as outputHeadderFile:
        for h in headderList:
            outputHeadderFile.write(h + "\r\n")

    ## EVDXID is the record key and is written as the first column, so it is
    ## taken out of the list of value fields.
    headderList.remove('EVDXID')

    ## One namedtuple field per remaining header, in sorted header order.
    ds = namedtuple('EVDXID', headderList)
    ## Merge before opening the record output file, so a failure while
    ## reading the inputs does not leave a header-only output file behind.
    _, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)

    with codecs.open(os.path.join(outputPath, outputFileName), 'w', "UTF-8") as outputRecordFile:
        ## Write the header row in the record output file too
        headerCells = ['EVDXID'] + headderList
        outputRecordFile.write(
            delimChar.join(quoteChar + cell + quoteChar for cell in headerCells) + "\r\n")

        ## One row per record, sorted by record ID; the namedtuple's values
        ## are already in the same order as headderList.
        for record in sorted(recordDict):
            rowCells = [record] + list(recordDict[record])
            outputRecordFile.write(
                delimChar.join(quoteChar + cell + quoteChar for cell in rowCells) + "\r\n")
Revision:	746
Committed:	Thu Apr 15 20:11:16 2021 UTC (4 years, 11 months ago) by nino.borges
Content type:	text/x-python
File size:	4846 byte(s)
Log Message:	Updated to be compatible with python3, which made the bak files.
#	User	Rev	Content
1	nino.borges	633	#!/usr/bin/env python
2			# -- coding: UTF-8 --
3
4			"""
5
6			addErrorFileFixer.py
7
8			Created by
9			Emanuel Borges
10			08.31.2017
11
12			This program will take a directory of dat files and merge them into a 3 dimensional array, ending with 1 large dat file
13
14			"""
15
16			import codecs, os
17			from collections import namedtuple
18
19			def ConcordanceToList(line, delimChar, quoteChar):
20			"""Takes a line of UTF formatted Concordance delimited text and returns a list"""
21	nino.borges	746	line = line.replace("\r","")
22			line = line.replace("\n","")
23			line = line.replace("\ufeff","")
24			line = line.replace(quoteChar,"")
25	nino.borges	633	line = line.split(delimChar)
26			return line
27
28			def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
29			"""cycle through all of the DAT files in this dir and extract out the unique fields in the headder"""
30			headderDict = {}
31			for f in os.listdir(pathToFiles):
32			contents = codecs.open(os.path.join(pathToFiles,f), "r", "UTF-8").readlines()
33			headderRow = contents[0]
34	nino.borges	746	headderRow = headderRow.replace(" ","_")
35	nino.borges	633	headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
36			for h in headderRow:
37			headderDict[h] = 1
38			return headderDict
39
40			def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj,fullHeadderList):
41			"""cycle through all of the dat files, populating your object"""
42			recordDict = {}
43			for f in os.listdir(pathToFiles):
44			contents = codecs.open(os.path.join(pathToFiles,f), "r", "UTF-8").readlines()
45			headderRow = contents[0]
46	nino.borges	746	headderRow = headderRow.replace(" ","_")
47	nino.borges	633	headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
48			## Remove evidoxID from the headder row
49			headderRow = headderRow[1:]
50			## use set to get a list of the missing headder fields, since the datastore obj cant have empty values
51			#print headderRow
52			#print fullHeadderList
53			remainderHeadderRow = set(fullHeadderList)-set(headderRow)
54			remainderHeadderRow = list(remainderHeadderRow)
55			#print remainderHeadderRow
56			contents = contents[1:]
57			for line in contents:
58			line = ConcordanceToList(line, delimChar, quoteChar)
59			eID = line[0]
60			## Now remove it so that the headder is back in sync with the line.
61			line = line[1:]
62			tempDict = {}
63			## Now pack a temp dict with all the possible values, including teh missing ones.
64			for i in range(len(headderRow)):
65			field = headderRow[i]
66			value = line[i]
67			#print eID, field, value
68			tempDict[field] = value
69			for rh in remainderHeadderRow:
70	nino.borges	746	tempDict[rh] = ""
71	nino.borges	633	## Now create teh obj entry, using the ** unpack method
72			recordDict[eID] = dataStoreObj(**tempDict)
73			return dataStoreObj, recordDict
74
75			if __name__ == '__main__':
76	nino.borges	746	delimChar = "\x14"
77			quoteChar = "\xfe"
78	nino.borges	633	mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
79			outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
80			outputFileName = '20171122-Output.txt'
81			outputHeaderFileName = '20171122-HeaddersOutput.txt'
82
83
84			## Gather all the field names to create the 3d array.
85			headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
86	nino.borges	746	headderList = list(headderDict.keys())
87	nino.borges	633	headderList.sort()
88			## Quickly make a text file with the headers, just to be nice.
89			outputHeadderFile = codecs.open(os.path.join(outputPath,outputHeaderFileName),'w',"UTF-8")
90			for h in headderList:
91	nino.borges	746	outputHeadderFile.write(h + "\r\n")
92	nino.borges	633	outputHeadderFile.close()
93
94	nino.borges	746	headderList.remove('EVDXID')
95	nino.borges	633	## Write the header row in the record outputfile too
96			outputRecordFile = codecs.open(os.path.join(outputPath,outputFileName),'w',"UTF-8")
97	nino.borges	746	outputRecordFile.write(quoteChar+'EVDXID'+quoteChar)
98	nino.borges	633	for h in headderList:
99			outputRecordFile.write(delimChar+quoteChar+h+quoteChar)
100	nino.borges	746	outputRecordFile.write("\r\n")
101	nino.borges	633
102
103			#print headderList
104			## The * here is the splat opperator
105			#ds = namedtuple(*headderList)
106	nino.borges	746	ds = namedtuple('EVDXID',headderList)
107	nino.borges	633	#print ds._fields
108			newDS, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)
109
110
111	nino.borges	746	recordList = list(recordDict.keys())
112	nino.borges	633	recordList.sort()
113			for record in recordList:
114			outputRecordFile.write(quoteChar+record+quoteChar)
115			for i in range(len(headderList)):
116			outputRecordFile.write(delimChar + quoteChar + recordDict[record][i] + quoteChar)
117	nino.borges	746	outputRecordFile.write("\r\n")
118	nino.borges	633	outputRecordFile.close()