Active_prgs/Evidox/addErrorFileFixer.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""

addErrorFileFixer.py

Created by
Emanuel Borges
08.31.2017

This program takes a directory of Concordance DAT files, merges their records (keyed on the leading EVDXID column) across the union of all header fields found in any file, and writes the result as one large DAT file.

"""

import codecs, os
from collections import namedtuple

def ConcordanceToList(line, delimChar, quoteChar):
    """Split one line of Concordance-delimited unicode text into a list of field values.

    Carriage returns, newlines, byte-order marks and every occurrence of
    quoteChar are removed from the line first; what is left is split on
    delimChar.  An empty line therefore yields a list holding one empty string.
    """
    cleaned = line
    # Strip line endings, the BOM and the text qualifier, in that order.
    for unwanted in (u"\r", u"\n", u"\ufeff", quoteChar):
        cleaned = cleaned.replace(unwanted, u"")
    return cleaned.split(delimChar)

def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
    """Collect the unique header field names found in every file of a directory.

    pathToFiles -- directory whose entries are each opened as a UTF-8 text file.
    delimChar   -- field delimiter passed through to ConcordanceToList.
    quoteChar   -- text qualifier passed through to ConcordanceToList.

    Spaces in the header line are replaced with underscores before it is
    split.  Returns a dict whose keys are the field names (every value is 1).
    Files with no content at all are skipped.
    """
    headderDict = {}
    for f in os.listdir(pathToFiles):
        ## Only the first line is needed, so do not read the whole file; the
        ## with block guarantees the handle is closed again.
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as datFile:
            headderRow = datFile.readline()
        if not headderRow:
            ## A completely empty file has no header row to contribute.
            continue
        headderRow = headderRow.replace(u" ", u"_")
        for h in ConcordanceToList(headderRow, delimChar, quoteChar):
            headderDict[h] = 1
    return headderDict

def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj,fullHeadderList):
    """Read every file in a directory and build one record object per ID.

    pathToFiles     -- directory whose entries are each opened as a UTF-8 text file.
    delimChar       -- field delimiter passed through to ConcordanceToList.
    quoteChar       -- text qualifier passed through to ConcordanceToList.
    dataStoreObj    -- callable invoked with one keyword argument per field
                       (the caller in this file passes a namedtuple class).
    fullHeadderList -- every field name a record must supply, ID column excluded.

    The first column of each file is treated as the record ID and is not passed
    to dataStoreObj.  Fields named in fullHeadderList that a file's header does
    not contain are supplied as empty strings.  When the same ID occurs more
    than once, the record read last replaces the earlier one.  Files with no
    content at all are skipped.

    Returns a tuple (dataStoreObj, recordDict) where recordDict maps each ID to
    the object built for it.  A data row with fewer values than its header
    raises IndexError.
    """
    recordDict = {}
    for f in os.listdir(pathToFiles):
        ## The with block makes sure the handle is closed once the lines are read.
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as datFile:
            contents = datFile.readlines()
        if not contents:
            ## Nothing to merge from a completely empty file.
            continue
        headderRow = contents[0].replace(u" ", u"_")
        headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
        ## Remove the ID column from the headder row
        headderRow = headderRow[1:]
        ## Fields this file lacks; the datastore obj needs a value for every field.
        remainderHeadderRow = set(fullHeadderList) - set(headderRow)
        for line in contents[1:]:
            line = ConcordanceToList(line, delimChar, quoteChar)
            eID = line[0]
            ## Now remove it so that the headder is back in sync with the line.
            line = line[1:]
            ## Start from blanks for the missing fields.  This happens once per
            ## record, and still happens when the file has no column but the ID.
            tempDict = dict.fromkeys(remainderHeadderRow, u"")
            for i, field in enumerate(headderRow):
                tempDict[field] = line[i]
            ## Now create the obj entry, using the ** unpack method
            recordDict[eID] = dataStoreObj(**tempDict)
    return dataStoreObj, recordDict

if __name__ == '__main__':
    ## Script entry point: merge every DAT file in mainDir into a single DAT
    ## file in outputPath, plus a plain text list of all field names found.
    delimChar = u"\x14"
    quoteChar = u"\xfe"
    mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
    outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
    outputFileName = '20171122-Output.txt'
    outputHeaderFileName = '20171122-HeaddersOutput.txt'

    ##  Gather all the field names that appear in any input header.
    headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
    ##  sorted() gives a real, sorted list on both Python 2 and Python 3
    ##  (dict.keys() has no sort() method on Python 3).
    headderList = sorted(headderDict)

    ##  Quickly make a text file with the headers, just to be nice.
    with codecs.open(os.path.join(outputPath,outputHeaderFileName),'w',"UTF-8") as outputHeadderFile:
        for h in headderList:
            outputHeadderFile.write(h + u"\r\n")

    ##  The ID column is the record key, not a record field.
    ##  Raises ValueError if no input header contained EVDXID.
    headderList.remove(u'EVDXID')

    ##  The with block closes the output file even if the merge below fails.
    with codecs.open(os.path.join(outputPath,outputFileName),'w',"UTF-8") as outputRecordFile:
        ## Write the header row in the record outputfile too
        outputRecordFile.write(quoteChar+u'EVDXID'+quoteChar)
        for h in headderList:
            outputRecordFile.write(delimChar+quoteChar+h+quoteChar)
        outputRecordFile.write(u"\r\n")

        ## One namedtuple field per non-ID column, in sorted header order.
        ## NOTE: namedtuple rejects field names that are not valid identifiers,
        ## so a header containing such a name raises ValueError here.
        ds = namedtuple(u'EVDXID',headderList)
        newDS, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)

        ## Records go out sorted by ID; the values of each record come back in
        ## field order, which is the order of the header row written above.
        for record in sorted(recordDict):
            outputRecordFile.write(quoteChar+record+quoteChar)
            for value in recordDict[record]:
                outputRecordFile.write(delimChar + quoteChar + value + quoteChar)
            outputRecordFile.write(u"\r\n")
Revision:	633
Committed:	Wed Mar 28 13:54:39 2018 UTC (8 years ago) by nino.borges
Content type:	text/x-python
File size:	4854 byte(s)
Log Message:	A folder for my Evidox programs
#	User	Rev	Content
1	nino.borges	633	#!/usr/bin/env python
2			# -- coding: UTF-8 --
3
4			"""
5
6			addErrorFileFixer.py
7
8			Created by
9			Emanuel Borges
10			08.31.2017
11
12			This program will take a directory of dat files and merge them into a 3 dimensional array, ending with 1 large dat file
13
14			"""
15
16			import codecs, os
17			from collections import namedtuple
18
19			def ConcordanceToList(line, delimChar, quoteChar):
20			"""Takes a line of UTF formatted Concordance delimited text and returns a list"""
21			line = line.replace(u"\r",u"")
22			line = line.replace(u"\n",u"")
23			line = line.replace(u"\ufeff",u"")
24			line = line.replace(quoteChar,u"")
25			line = line.split(delimChar)
26			return line
27
28			def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
29			"""cycle through all of the DAT files in this dir and extract out the unique fields in the headder"""
30			headderDict = {}
31			for f in os.listdir(pathToFiles):
32			contents = codecs.open(os.path.join(pathToFiles,f), "r", "UTF-8").readlines()
33			headderRow = contents[0]
34			headderRow = headderRow.replace(u" ",u"_")
35			headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
36			for h in headderRow:
37			headderDict[h] = 1
38			return headderDict
39
40			def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj,fullHeadderList):
41			"""cycle through all of the dat files, populating your object"""
42			recordDict = {}
43			for f in os.listdir(pathToFiles):
44			contents = codecs.open(os.path.join(pathToFiles,f), "r", "UTF-8").readlines()
45			headderRow = contents[0]
46			headderRow = headderRow.replace(u" ",u"_")
47			headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
48			## Remove evidoxID from the headder row
49			headderRow = headderRow[1:]
50			## use set to get a list of the missing headder fields, since the datastore obj cant have empty values
51			#print headderRow
52			#print fullHeadderList
53			remainderHeadderRow = set(fullHeadderList)-set(headderRow)
54			remainderHeadderRow = list(remainderHeadderRow)
55			#print remainderHeadderRow
56			contents = contents[1:]
57			for line in contents:
58			line = ConcordanceToList(line, delimChar, quoteChar)
59			eID = line[0]
60			## Now remove it so that the headder is back in sync with the line.
61			line = line[1:]
62			tempDict = {}
63			## Now pack a temp dict with all the possible values, including teh missing ones.
64			for i in range(len(headderRow)):
65			field = headderRow[i]
66			value = line[i]
67			#print eID, field, value
68			tempDict[field] = value
69			for rh in remainderHeadderRow:
70			tempDict[rh] = u""
71			## Now create teh obj entry, using the ** unpack method
72			recordDict[eID] = dataStoreObj(**tempDict)
73			return dataStoreObj, recordDict
74
75			if __name__ == '__main__':
76			delimChar = u"\x14"
77			quoteChar = u"\xfe"
78			mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
79			outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
80			outputFileName = '20171122-Output.txt'
81			outputHeaderFileName = '20171122-HeaddersOutput.txt'
82
83
84			## Gather all the field names to create the 3d array.
85			headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
86			headderList = headderDict.keys()
87			headderList.sort()
88			## Quickly make a text file with the headers, just to be nice.
89			outputHeadderFile = codecs.open(os.path.join(outputPath,outputHeaderFileName),'w',"UTF-8")
90			for h in headderList:
91			outputHeadderFile.write(h + u"\r\n")
92			outputHeadderFile.close()
93
94			headderList.remove(u'EVDXID')
95			## Write the header row in the record outputfile too
96			outputRecordFile = codecs.open(os.path.join(outputPath,outputFileName),'w',"UTF-8")
97			outputRecordFile.write(quoteChar+u'EVDXID'+quoteChar)
98			for h in headderList:
99			outputRecordFile.write(delimChar+quoteChar+h+quoteChar)
100			outputRecordFile.write(u"\r\n")
101
102
103			#print headderList
104			## The * here is the splat opperator
105			#ds = namedtuple(*headderList)
106			ds = namedtuple(u'EVDXID',headderList)
107			#print ds._fields
108			newDS, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)
109
110
111			recordList = recordDict.keys()
112			recordList.sort()
113			for record in recordList:
114			outputRecordFile.write(quoteChar+record+quoteChar)
115			for i in range(len(headderList)):
116			outputRecordFile.write(delimChar + quoteChar + recordDict[record][i] + quoteChar)
117			outputRecordFile.write(u"\r\n")
118			outputRecordFile.close()