ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Evidox/addErrorFileFixer.py
Revision: 633
Committed: Wed Mar 28 13:54:39 2018 UTC (8 years ago) by nino.borges
Content type: text/x-python
File size: 4854 byte(s)
Log Message:
A folder for my Evidox programs

File Contents

# Content
1 #!/usr/bin/env python
2 # -*- coding: UTF-8 -*-
3
4 """
5
6 addErrorFileFixer.py
7
8 Created by
9 Emanuel Borges
10 08.31.2017
11
12 This program takes a directory of Concordance DAT files and merges their records, keyed by EVDXID, into 1 large DAT file whose columns are the union of the fields found in all of the input headers.
13
14 """
15
16 import codecs, os
17 from collections import namedtuple
18
def ConcordanceToList(line, delimChar, quoteChar):
    """Split one line of Concordance-delimited unicode text into a list.

    Carriage returns, line feeds, the byte-order mark (U+FEFF) and every
    occurrence of quoteChar are removed from the line; the remainder is then
    split on delimChar and the resulting list of field values is returned.
    """
    # None of these characters is kept in a field value.
    for unwanted in (u"\r", u"\n", u"\ufeff", quoteChar):
        line = line.replace(unwanted, u"")
    return line.split(delimChar)
27
def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
    """Collect the unique header field names of every DAT file in a directory.

    Only the first line (the header row) of each file is read.  Spaces in the
    header row are replaced with underscores before the row is split with
    ConcordanceToList.

    pathToFiles -- directory whose files are scanned
    delimChar   -- field delimiter handed to ConcordanceToList
    quoteChar   -- quote character handed to ConcordanceToList

    Returns a dict whose keys are the unique field names (every value is 1).
    Sub-directories and completely empty files are skipped.
    """
    headderDict = {}
    for f in os.listdir(pathToFiles):
        fullPath = os.path.join(pathToFiles, f)
        if not os.path.isfile(fullPath):
            # os.listdir also returns sub-directories, which cannot be read.
            continue
        # Only the header is needed, so read a single line, and use a context
        # manager so the file handle is closed rather than leaked.
        with codecs.open(fullPath, "r", "UTF-8") as datFile:
            headderRow = datFile.readline()
        if not headderRow:
            # Empty file: there is no header row to harvest.
            continue
        headderRow = headderRow.replace(u" ", u"_")
        for h in ConcordanceToList(headderRow, delimChar, quoteChar):
            headderDict[h] = 1
    return headderDict
39
def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj, fullHeadderList):
    """Read every DAT file in a directory and build one record per ID.

    The first column of each file is treated as the record ID.  The remaining
    header names (spaces replaced with underscores) are paired with the
    remaining values of each row, and any field in fullHeadderList that the
    file does not carry is filled with an empty string, because dataStoreObj
    is called with one keyword argument per field.

    pathToFiles     -- directory whose files are read
    delimChar       -- field delimiter handed to ConcordanceToList
    quoteChar       -- quote character handed to ConcordanceToList
    dataStoreObj    -- callable invoked as dataStoreObj(**fields) per record
    fullHeadderList -- every field name a record must have (ID excluded)

    Returns a (dataStoreObj, recordDict) tuple, where recordDict maps each
    record ID to the object built for it.

    Raises IndexError when a row has fewer values than its header has fields.
    Sub-directories, empty files and blank lines are skipped.
    """
    recordDict = {}
    for f in os.listdir(pathToFiles):
        fullPath = os.path.join(pathToFiles, f)
        if not os.path.isfile(fullPath):
            # os.listdir also returns sub-directories, which cannot be read.
            continue
        # Context manager so the file handle is closed rather than leaked.
        with codecs.open(fullPath, "r", "UTF-8") as datFile:
            contents = datFile.readlines()
        if not contents:
            # Empty file: no header row and no records.
            continue
        headderRow = contents[0].replace(u" ", u"_")
        headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)
        ## Remove the ID column from the header row
        headderRow = headderRow[1:]
        ## Fields this file lacks; they still need a value in every record.
        missingFields = set(fullHeadderList) - set(headderRow)
        for lineNumber, rawLine in enumerate(contents[1:], 2):
            if not rawLine.strip():
                # A blank (typically trailing) line is not a record.
                continue
            line = ConcordanceToList(rawLine, delimChar, quoteChar)
            eID = line[0]
            ## Drop the ID so the values line up with the header again.
            values = line[1:]
            if len(values) < len(headderRow):
                raise IndexError(
                    "%r line %d has %d value(s) but the header names %d field(s)"
                    % (fullPath, lineNumber, len(values), len(headderRow)))
            ## Start from blanks for the missing fields, then lay the real
            ## values on top (values beyond the header length are ignored).
            tempDict = dict.fromkeys(missingFields, u"")
            tempDict.update(zip(headderRow, values))
            ## NOTE(review): an ID that appears in more than one file is
            ## replaced wholesale by the file read last, blanks included --
            ## confirm that this is the intended merge behaviour.
            recordDict[eID] = dataStoreObj(**tempDict)
    return dataStoreObj, recordDict
74
if __name__ == '__main__':
    ## Concordance DAT delimiters: field separator and text qualifier.
    delimChar = u"\x14"
    quoteChar = u"\xfe"
    mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
    outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
    outputFileName = '20171122-Output.txt'
    outputHeaderFileName = '20171122-HeaddersOutput.txt'

    ## Gather all the field names found across the input files.
    headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
    ## sorted() gives a real list on both Python 2 and Python 3.
    headderList = sorted(headderDict)

    ## Quickly make a text file with the headers (ID included), just to be nice.
    with codecs.open(os.path.join(outputPath, outputHeaderFileName), 'w', "UTF-8") as outputHeadderFile:
        for h in headderList:
            outputHeadderFile.write(h + u"\r\n")

    ## The ID is written as the leading column, so it is not a record field.
    headderList.remove(u'EVDXID')

    ## One namedtuple field per remaining header, in sorted header order.
    ds = namedtuple(u'EVDXID', headderList)
    ## Merge before opening the output so a failed merge leaves no partial file.
    newDS, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)

    ## Between two neighbouring values: closing quote, delimiter, opening quote.
    fieldSeparator = quoteChar + delimChar + quoteChar
    with codecs.open(os.path.join(outputPath, outputFileName), 'w', "UTF-8") as outputRecordFile:
        ## Header row first, then one row per record in sorted ID order.
        headerCells = [u'EVDXID'] + headderList
        outputRecordFile.write(quoteChar + fieldSeparator.join(headerCells) + quoteChar + u"\r\n")
        for record in sorted(recordDict):
            ## The namedtuple iterates its values in headderList order.
            rowCells = [record] + list(recordDict[record])
            outputRecordFile.write(quoteChar + fieldSeparator.join(rowCells) + quoteChar + u"\r\n")