#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
addErrorFileFixer.py

Created by Emanuel Borges
08.31.2017

This program will take a directory of dat files and merge them into a 3 dimensional array, ending with 1 large dat file
"""

import codecs, os
from collections import namedtuple

def ConcordanceToList(line, delimChar, quoteChar):
    """Convert one line of UTF Concordance-delimited text into a list of fields.

    Strips carriage returns, newlines, any BOM, and every occurrence of the
    quote character, then splits what remains on the delimiter.
    """
    ## One translate pass removes all unwanted characters in a single sweep.
    strip_map = {ord(ch): None for ch in (u"\r", u"\n", u"\ufeff", quoteChar)}
    return line.translate(strip_map).split(delimChar)

def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
    """Scan every DAT file in pathToFiles and collect the union of header fields.

    Returns a dict whose keys are the unique (space-normalised) header field
    names; the values are always 1 and carry no meaning — the caller uses
    the dict purely as a set of names.
    """
    headderDict = {}
    for f in os.listdir(pathToFiles):
        ## Only the first (header) line is needed: readline() avoids loading
        ## the whole file, and the with-block guarantees the handle is closed
        ## (the original leaked one open handle per file).
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as fh:
            headderRow = fh.readline()
        ## Spaces become underscores so the names can later serve as
        ## namedtuple field names.
        headderRow = headderRow.replace(u" ", u"_")
        for h in ConcordanceToList(headderRow, delimChar, quoteChar):
            headderDict[h] = 1
    return headderDict

def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj, fullHeadderList):
    """Read every DAT file in pathToFiles and merge the rows into one dict.

    Each data row becomes a dataStoreObj (a namedtuple type) keyed by the
    value of its first column (the EVDXID).  Fields that appear in
    fullHeadderList but not in a given file are filled with u"" so that
    every record carries the complete field set.  If the same EVDXID occurs
    in more than one file, the later file's row overwrites the earlier one.

    Returns (dataStoreObj, recordDict); the type object is handed back
    unchanged for the caller's convenience.
    """
    recordDict = {}
    for f in os.listdir(pathToFiles):
        ## with-block closes each input file (the original leaked handles).
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as fh:
            contents = fh.readlines()
        headderRow = contents[0].replace(u" ", u"_")
        headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
        ## Drop the leading EVDXID column: it becomes the record key below,
        ## not a data field.
        headderRow = headderRow[1:]
        ## Fields the full header has but this file lacks must still be
        ## supplied (namedtuple construction requires every field).
        missingFields = set(fullHeadderList) - set(headderRow)
        for line in contents[1:]:
            line = ConcordanceToList(line, delimChar, quoteChar)
            eID = line[0]
            ## Pair each header field with its value; line[1:] skips the
            ## EVDXID column so it stays in sync with headderRow.
            ## NOTE(review): assumes each data row has at least as many
            ## columns as its header — confirm upstream files guarantee this.
            tempDict = dict(zip(headderRow, line[1:]))
            for rh in missingFields:
                tempDict[rh] = u""
            ## ** unpacks the dict into namedtuple keyword arguments.
            recordDict[eID] = dataStoreObj(**tempDict)
    return dataStoreObj, recordDict

if __name__ == '__main__':
    ## Concordance standard delimiters: \x14 between fields, \xfe as quote.
    delimChar = u"\x14"
    quoteChar = u"\xfe"
    mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
    outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
    outputFileName = '20171122-Output.txt'
    outputHeaderFileName = '20171122-HeaddersOutput.txt'

    ## Gather all the field names to create the 3d array.
    headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
    ## sorted() instead of list.sort() on .keys(): dict.keys() is a view on
    ## Python 3 with no .sort method, and sorted() works on both 2 and 3.
    headderList = sorted(headderDict.keys())

    ## Quickly make a text file with the headers, just to be nice.
    ## with-blocks guarantee the output files are flushed and closed even if
    ## an exception interrupts the run (the originals were only closed on
    ## the happy path, or not at all).
    with codecs.open(os.path.join(outputPath, outputHeaderFileName), 'w', "UTF-8") as outputHeadderFile:
        for h in headderList:
            outputHeadderFile.write(h + u"\r\n")

    ## EVDXID is the record key, not a data field; raises ValueError if an
    ## input file set ever lacks it, which is the desired loud failure.
    headderList.remove(u'EVDXID')

    with codecs.open(os.path.join(outputPath, outputFileName), 'w', "UTF-8") as outputRecordFile:
        ## Header row: EVDXID first, then the remaining fields in sorted order.
        outputRecordFile.write(quoteChar + u'EVDXID' + quoteChar)
        for h in headderList:
            outputRecordFile.write(delimChar + quoteChar + h + quoteChar)
        outputRecordFile.write(u"\r\n")

        ## namedtuple field order matches headderList, so iterating a record
        ## below lines up positionally with the header row just written.
        ds = namedtuple(u'EVDXID', headderList)
        newDS, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)

        for record in sorted(recordDict.keys()):
            outputRecordFile.write(quoteChar + record + quoteChar)
            for value in recordDict[record]:
                outputRecordFile.write(delimChar + quoteChar + value + quoteChar)
            outputRecordFile.write(u"\r\n")