#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
addErrorFileFixer.py

Created by Emanuel Borges
08.31.2017

This program will take a directory of dat files and merge them into a 3 dimensional array, ending with 1 large dat file
"""

import codecs, os
from collections import namedtuple

def ConcordanceToList(line, delimChar, quoteChar):
    """Convert one line of UTF Concordance-delimited text into a list of fields.

    Strips carriage returns, newlines, any BOM, and every occurrence of the
    quote character, then splits what remains on the delimiter.
    """
    ## One translate pass removes all unwanted characters in a single sweep.
    strip_map = {ord(ch): None for ch in (u"\r", u"\n", u"\ufeff", quoteChar)}
    return line.translate(strip_map).split(delimChar)

def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
    """Scan every DAT file in pathToFiles and collect the union of header fields.

    Returns a dict whose keys are the unique (space-normalised) header field
    names; the values are always 1 and carry no meaning — the caller uses
    the dict purely as a set of names.
    """
    headderDict = {}
    for f in os.listdir(pathToFiles):
        ## Only the first (header) line is needed: readline() avoids loading
        ## the whole file, and the with-block guarantees the handle is closed
        ## (the original leaked one open handle per file).
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as fh:
            headderRow = fh.readline()
        ## Spaces become underscores so the names can later serve as
        ## namedtuple field names.
        headderRow = headderRow.replace(u" ", u"_")
        for h in ConcordanceToList(headderRow, delimChar, quoteChar):
            headderDict[h] = 1
    return headderDict

def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj, fullHeadderList):
    """Read every DAT file in pathToFiles and merge the rows into one dict.

    Each data row becomes a dataStoreObj (a namedtuple type) keyed by the
    value of its first column (the EVDXID).  Fields that appear in
    fullHeadderList but not in a given file are filled with u"" so that
    every record carries the complete field set.  If the same EVDXID occurs
    in more than one file, the later file's row overwrites the earlier one.

    Returns (dataStoreObj, recordDict); the type object is handed back
    unchanged for the caller's convenience.
    """
    recordDict = {}
    for f in os.listdir(pathToFiles):
        ## with-block closes each input file (the original leaked handles).
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as fh:
            contents = fh.readlines()
        headderRow = contents[0].replace(u" ", u"_")
        headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
        ## Drop the leading EVDXID column: it becomes the record key below,
        ## not a data field.
        headderRow = headderRow[1:]
        ## Fields the full header has but this file lacks must still be
        ## supplied (namedtuple construction requires every field).
        missingFields = set(fullHeadderList) - set(headderRow)
        for line in contents[1:]:
            line = ConcordanceToList(line, delimChar, quoteChar)
            eID = line[0]
            ## Pair each header field with its value; line[1:] skips the
            ## EVDXID column so it stays in sync with headderRow.
            ## NOTE(review): assumes each data row has at least as many
            ## columns as its header — confirm upstream files guarantee this.
            tempDict = dict(zip(headderRow, line[1:]))
            for rh in missingFields:
                tempDict[rh] = u""
            ## ** unpacks the dict into namedtuple keyword arguments.
            recordDict[eID] = dataStoreObj(**tempDict)
    return dataStoreObj, recordDict

if __name__ == '__main__':
    ## Concordance standard delimiters: \x14 between fields, \xfe as quote.
    delimChar = u"\x14"
    quoteChar = u"\xfe"
    mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
    outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
    outputFileName = '20171122-Output.txt'
    outputHeaderFileName = '20171122-HeaddersOutput.txt'

    ## Gather all the field names to create the 3d array.
    headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
    ## sorted() instead of list.sort() on .keys(): dict.keys() is a view on
    ## Python 3 with no .sort method, and sorted() works on both 2 and 3.
    headderList = sorted(headderDict.keys())

    ## Quickly make a text file with the headers, just to be nice.
    ## with-blocks guarantee the output files are flushed and closed even if
    ## an exception interrupts the run (the originals were only closed on
    ## the happy path, or not at all).
    with codecs.open(os.path.join(outputPath, outputHeaderFileName), 'w', "UTF-8") as outputHeadderFile:
        for h in headderList:
            outputHeadderFile.write(h + u"\r\n")

    ## EVDXID is the record key, not a data field; raises ValueError if an
    ## input file set ever lacks it, which is the desired loud failure.
    headderList.remove(u'EVDXID')

    with codecs.open(os.path.join(outputPath, outputFileName), 'w', "UTF-8") as outputRecordFile:
        ## Header row: EVDXID first, then the remaining fields in sorted order.
        outputRecordFile.write(quoteChar + u'EVDXID' + quoteChar)
        for h in headderList:
            outputRecordFile.write(delimChar + quoteChar + h + quoteChar)
        outputRecordFile.write(u"\r\n")

        ## namedtuple field order matches headderList, so iterating a record
        ## below lines up positionally with the header row just written.
        ds = namedtuple(u'EVDXID', headderList)
        newDS, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)

        for record in sorted(recordDict.keys()):
            outputRecordFile.write(quoteChar + record + quoteChar)
            for value in recordDict[record]:
                outputRecordFile.write(delimChar + quoteChar + value + quoteChar)
            outputRecordFile.write(u"\r\n")