ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Evidox/addErrorFileFixer.py
Revision: 633
Committed: Wed Mar 28 13:54:39 2018 UTC (8 years ago) by nino.borges
Content type: text/x-python
File size: 4854 byte(s)
Log Message:
A folder for my Evidox programs

File Contents

# User Rev Content
1 nino.borges 633 #!/usr/bin/env python
2     # -*- coding: UTF-8 -*-
3    
4     """
5    
6     addErrorFileFixer.py
7    
8     Created by
9     Emanuel Borges
10     08.31.2017
11    
12     This program takes a directory of DAT files and merges their records, keyed by EVDXID, into 1 large DAT file whose columns are the union of every field found in the input headers.
13    
14     """
15    
16     import codecs, os
17     from collections import namedtuple
18    
def ConcordanceToList(line, delimChar, quoteChar):
    """Split one line of Concordance-delimited unicode text into field values.

    Carriage returns, line feeds and the byte-order mark are removed first,
    then every occurrence of quoteChar is removed, and what is left is split
    on delimChar.  The resulting list of strings is returned.
    """
    ## Strip the line-ending/BOM noise and the quote character, in that order
    for unwanted in (u"\r", u"\n", u"\ufeff", quoteChar):
        line = line.replace(unwanted, u"")
    return line.split(delimChar)
27    
def GatherAllPossibleFields(pathToFiles, delimChar, quoteChar):
    """Collect the unique header field names of every file in a directory.

    Each entry of pathToFiles is opened as UTF-8 text.  Spaces in its first
    line are replaced with underscores, the line is split with
    ConcordanceToList, and every resulting field name is recorded.

    pathToFiles -- directory whose entries are all read as DAT files
    delimChar   -- field delimiter passed to ConcordanceToList
    quoteChar   -- quote character passed to ConcordanceToList

    Returns a dict whose keys are the field names (every value is 1).
    Files with no content at all are skipped.
    """
    headderDict = {}
    for f in os.listdir(pathToFiles):
        ## Only the header line is needed, so read just that one line; the
        ## with-block guarantees the handle is closed before the next file.
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as datFile:
            headderRow = datFile.readline()
        if not headderRow:
            ## Completely empty file: it has no header to contribute
            continue
        headderRow = headderRow.replace(u" ", u"_")
        for h in ConcordanceToList(headderRow, delimChar, quoteChar):
            headderDict[h] = 1
    return headderDict
39    
def MergeAllRecords(pathToFiles, delimChar, quoteChar, dataStoreObj, fullHeadderList):
    """Read every file in a directory and build one record object per row.

    For each file the first line is treated as the header (spaces replaced
    with underscores) and its first column is treated as the record ID.
    Every following line becomes dataStoreObj(**fields), where fields maps
    each header name to the value in the same position and maps every name
    in fullHeadderList that this file lacks to an empty string.

    pathToFiles     -- directory whose entries are all read as DAT files
    delimChar       -- field delimiter passed to ConcordanceToList
    quoteChar       -- quote character passed to ConcordanceToList
    dataStoreObj    -- callable accepting one keyword argument per field
    fullHeadderList -- every field name the finished records must carry

    Returns (dataStoreObj, recordDict); recordDict maps record ID to the
    created object.  A record ID seen again later replaces the earlier entry.
    Empty files and blank lines are skipped.

    Raises IndexError if a row has fewer values than its file's header.
    """
    recordDict = {}
    for f in os.listdir(pathToFiles):
        ## Close the handle as soon as the lines are in memory
        with codecs.open(os.path.join(pathToFiles, f), "r", "UTF-8") as datFile:
            contents = datFile.readlines()
        if not contents:
            ## Completely empty file: no header and no records
            continue
        headderRow = contents[0].replace(u" ", u"_")
        ## Drop the first (ID) column so the header lines up with the values
        headderRow = ConcordanceToList(headderRow, delimChar, quoteChar)[1:]
        ## Fields this file does not have; the record object needs a value
        ## for every field, so these are filled with empty strings below.
        remainderHeadderRow = set(fullHeadderList) - set(headderRow)
        for line in contents[1:]:
            line = ConcordanceToList(line, delimChar, quoteChar)
            if line == [u""]:
                ## Blank line (for example a trailing newline at end of file)
                continue
            eID = line[0]
            ## Remove the ID so the values are back in sync with the header
            values = line[1:]
            if len(values) < len(headderRow):
                raise IndexError(
                    u"%s: record %r has %d values but the header has %d fields"
                    % (f, eID, len(values), len(headderRow)))
            ## Pair each header field with its value; surplus values are ignored
            tempDict = dict(zip(headderRow, values))
            for rh in remainderHeadderRow:
                tempDict[rh] = u""
            ## Create the record object by unpacking the dict as keyword arguments
            recordDict[eID] = dataStoreObj(**tempDict)
    return dataStoreObj, recordDict
74    
if __name__ == '__main__':
    ## Script entry point: gather the union of header fields from mainDir,
    ## write that field list to a text file, then merge every record into
    ## one delimited output file with one column per field.
    delimChar = u"\x14"
    quoteChar = u"\xfe"
    mainDir = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG"
    outputPath = r"\\iadcifs01\iproshares01\EVDX-ADD Analytics Test-Ipro Template-02\QC Fixes\Overlays\20171122 - EBG - Output"
    outputFileName = '20171122-Output.txt'
    outputHeaderFileName = '20171122-HeaddersOutput.txt'

    ## Gather all the field names found across the input files.
    headderDict = GatherAllPossibleFields(mainDir, delimChar, quoteChar)
    ## sorted() returns a real list on both Python 2 and Python 3
    headderList = sorted(headderDict)

    ## Quickly make a text file with the headers, just to be nice.
    with codecs.open(os.path.join(outputPath, outputHeaderFileName), 'w', "UTF-8") as outputHeadderFile:
        for h in headderList:
            outputHeadderFile.write(h + u"\r\n")

    ## EVDXID is written as the leading column and is not a record field
    headderList.remove(u'EVDXID')

    ## The with-block closes the output file even if the merge fails part way
    with codecs.open(os.path.join(outputPath, outputFileName), 'w', "UTF-8") as outputRecordFile:
        ## Write the header row in the record output file too
        outputRecordFile.write(quoteChar + u'EVDXID' + quoteChar)
        for h in headderList:
            outputRecordFile.write(delimChar + quoteChar + h + quoteChar)
        outputRecordFile.write(u"\r\n")

        ## Record type with one attribute per remaining field, in sorted order
        ds = namedtuple(u'EVDXID', headderList)
        newDS, recordDict = MergeAllRecords(mainDir, delimChar, quoteChar, ds, headderList)

        ## One output row per record, ordered by record ID
        for record in sorted(recordDict):
            outputRecordFile.write(quoteChar + record + quoteChar)
            ## Iterating the namedtuple yields its values in headderList order
            for value in recordDict[record]:
                outputRecordFile.write(delimChar + quoteChar + value + quoteChar)
            outputRecordFile.write(u"\r\n")