| 1 |
### Glassman: for every document, I had to count how many times the phrase "work product" appears, then report those counts by claim instead of by document ###
|
| 2 |
|
# Count "Work Product" hits in every extracted-text file of the WP001
# production. matchMatrix maps the file name without its extension (later
# looked up by Bates number) to the number of hits in that file.
import os
import re

# "Work", then at most two intervening words, then "Product"; case-sensitive.
# Compiled once so the pattern is not looked up again for every file.
workProductPattern = re.compile(r'\bWork\W+(?:\w+\W+){0,2}?Product\b')

# Both text volumes feed the same matrix, so they share one loop.
textFolders = (
    r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\001",
    r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\002",
)

matchMatrix = {}
for startPath in textFolders:
    for f in os.listdir(startPath):
        # Close each file as soon as it has been read instead of leaving the
        # handle for the garbage collector.
        with open(os.path.join(startPath, f)) as textFile:
            fContentsString = textFile.read()
        match = workProductPattern.findall(fContentsString)
        matchMatrix[os.path.splitext(f)[0]] = len(match)

# startPath is left pointing at the 002 volume, exactly as in the original
# session; the later date scan starts from that value.

# Spot check recorded in the original session:
# matchMatrix['MAPFRECFEDP000000252'] -> 3
| 21 |
|
# Group Bates numbers by claim using the WP001 load file.
# Each data row is "bates|claim"; claimMatrix maps the lower-cased claim to
# the list of Bates numbers filed under it.
with open(r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Data\CONCORD.DAT") as datFile:
    # The first row is dropped (presumably the header row - TODO confirm).
    # Blank rows are dropped too: they cannot be unpacked into two fields and
    # would otherwise abort the whole run with a ValueError.
    contents = [row for row in datFile.readlines()[1:] if row.strip()]

claimMatrix = {}
for line in contents:
    line = line.replace("\n", "")
    bates, claim = line.split("|")
    claim = claim.lower()
    # setdefault replaces the "claim in claimMatrix.keys()" test, which scans
    # a freshly built list on every row under Python 2.
    claimMatrix.setdefault(claim, []).append(bates)

# Spot checks recorded in the original session:
# len(claimMatrix.keys()) -> 845
# claimMatrix["cann65"] -> ['MAPFRECFEDP000001418', 'MAPFRECFEDP000001751']
| 38 |
|
| 39 |
|
# Write the per-claim report: one "claim|count" row per claim, in sorted
# claim order, where count is the sum of the phrase hits of every document
# filed under that claim.
# sorted() works on both Python 2 and 3; keys() followed by .sort() only
# works on Python 2.
claimList = sorted(claimMatrix)

# The report is written with a ".txt" extension because the merge step further
# down reads "20180918_claimReport.txt"; the original session wrote it
# without an extension, so the two steps did not line up.
with open(r"C:\Test-PY\Glassman\20180918_claimReport.txt", 'w') as outputFile:
    for claim in claimList:
        # A Bates number with no entry in matchMatrix raises KeyError on
        # purpose: it means a document in the load file has no text file.
        claimCount = sum(matchMatrix[bates] for bates in claimMatrix[claim])
        outputFile.write("%s|%s\n" % (claim, claimCount))
| 51 |
|
| 52 |
|
| 53 |
### Then I was asked to grab the earliest date found in those same text files and add it to the report
|
| 54 |
|
# Collect every date-looking string (M/D/YYYY or M-D-YYYY, 1-2 digit month
# and day) from each WP001 text file. dateMatrix maps the file name without
# its extension to the list of raw matches, in the order they appear.
import datetime
import os
import re

# Compiled once; the same pattern is applied to every file in both volumes.
datePattern = re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b')

# The original session scanned whatever startPath was left over from the
# previous step (the 002 volume) and then reset it to 001. Both folders are
# named here so this step no longer depends on leftover state; the order and
# the final value of startPath (001) are unchanged.
dateFolders = (
    r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\002",
    r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\001",
)

dateMatrix = {}
for startPath in dateFolders:
    for f in os.listdir(startPath):
        # Close each file right after reading it.
        with open(os.path.join(startPath, f)) as textFile:
            fContentsString = textFile.read()
        dateMatrix[os.path.splitext(f)[0]] = datePattern.findall(fContentsString)
| 66 |
|
| 67 |
|
# Reduce each document's raw date strings to a single datetime: the earliest
# date that both parses and can be formatted back out.
# formattedDateMatrix maps Bates number -> datetime; documents with no usable
# date get no entry.

# The regex accepts "/" or "-" separators, so try both layouts. Strings that
# mix separators or hold impossible values (e.g. month 13) match neither and
# are skipped.
dateFormats = ("%m/%d/%Y", "%m-%d-%Y")

formattedDateMatrix = {}
for bates in dateMatrix:
    formattedDateList = []
    for dt in dateMatrix[bates]:
        for dateFormat in dateFormats:
            try:
                parsedDate = datetime.datetime.strptime(dt, dateFormat)
            except ValueError:
                # Wrong layout or impossible date; try the next layout.
                continue
            formattedDateList.append(parsedDate)
            break

    formattedDateList.sort()

    # The original stored formattedDateList[0] unconditionally, which the
    # author's note flagged as an issue: the oldest parsed date is not always
    # one strftime can format (Python 2 rejects years before 1900). Walk the
    # sorted list and keep the oldest date that formats cleanly, the same test
    # the per-claim step applies.
    for candidate in formattedDateList:
        try:
            candidate.strftime("%m/%d/%Y")
        except ValueError:
            continue
        formattedDateMatrix[bates] = candidate
        break
| 84 |
|
# Roll the per-document earliest dates up to claim level.
# formattedClaimMatrix maps claim -> "MM/DD/YYYY" string of the earliest
# formattable date among that claim's documents; claims with no usable date
# get no entry.
formattedClaimMatrix = {}
for claim in claimMatrix:
    # BUG FIX: the original tested "bates in formattedDateList", which is the
    # leftover list of datetimes from the previous loop, so a Bates string was
    # never found and every claim ended up without a date. The lookup belongs
    # against formattedDateMatrix, which is keyed by Bates number.
    formattedClaimList = sorted(
        formattedDateMatrix[bates]
        for bates in claimMatrix[claim]
        if bates in formattedDateMatrix
    )

    # Take the oldest date that strftime accepts; dates it rejects with
    # ValueError are skipped in favour of the next oldest.
    for dt in formattedClaimList:
        try:
            earliestDate = dt.strftime("%m/%d/%Y")
        except ValueError:
            continue
        formattedClaimMatrix[claim] = earliestDate
        break
| 103 |
|
# Writes one "claim|earliestDate" row per claim, in sorted claim order.
# NOTE(review): outputFile here is still the handle that was closed at the end
# of the claim-count report, and this transcript never reopens it, so as
# recorded these writes raise "ValueError: I/O operation on closed file".
# Presumably a reopen was run but not captured - confirm the intended
# destination file before re-running. The merge step below does not read this
# listing; it takes the dates straight from formattedClaimMatrix.
# NOTE(review): keys() followed by .sort() relies on Python 2, where keys()
# returns a list.
| 104 |
>>> formattedClaimDateList = formattedClaimMatrix.keys()
|
| 105 |
>>> formattedClaimDateList.sort()
|
| 106 |
>>> for i in formattedClaimDateList:
|
| 107 |
... outputFile.write("%s|%s\n"% (i,formattedClaimMatrix[i]))
|
| 108 |
...
|
| 109 |
>>> outputFile.close()
|
| 110 |
|
## Merge the two per-claim results into one report.
# Input rows are "claim|count"; output rows are "claim|count|earliestDate",
# with the date left empty for claims that have none. Both sides are keyed by
# claim, so the key is named "claim" here (the original called it "bates").

# Read the count report before opening the output, so a missing input does
# not leave behind an empty, truncated output file.
with open(r"C:\Test-PY\Glassman\20180918_claimReport.txt") as countReport:
    contents = countReport.readlines()

with open(r"C:\Test-PY\Glassman\20180918_claimReportWithDates.txt", 'w') as outputFile:
    for line in contents:
        line = line.replace("\n", "")
        claim, count = line.split("|")
        # Test membership on the dict itself: same answer as scanning the
        # sorted key list, without a linear search per row.
        if claim in formattedClaimMatrix:
            outputFile.write("%s|%s|%s\n" % (claim, count, formattedClaimMatrix[claim]))
        else:
            outputFile.write("%s|%s|\n" % (claim, count))
| 124 |
|
| 125 |
|
| 126 |
#### THIS IS THE SAME AS THE PROJECT ABOVE, BUT FOR THE NR001 PRODUCTION ("NON RESPONSIVE") AND WITH THE DATES TAKEN FROM THE DAT INSTEAD OF THE TEXT FILES ####
|
| 127 |
|
# Count "Non Responsive" hits in every extracted-text file of the NR001
# production. matchMatrix maps the file name without its extension (later
# looked up by Bates number) to the number of hits in that file.
import os
import re
import datetime

startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\NR001\Text\001"

# "Non", then at most two intervening words, then "Responsive";
# case-sensitive. Compiled once rather than looked up for every file.
nonResponsivePattern = re.compile(r'\bNon\W+(?:\w+\W+){0,2}?Responsive\b')

matchMatrix = {}
for f in os.listdir(startPath):
    # Close each file as soon as it has been read.
    with open(os.path.join(startPath, f)) as textFile:
        fContentsString = textFile.read()
    match = nonResponsivePattern.findall(fContentsString)
    matchMatrix[os.path.splitext(f)[0]] = len(match)

# Spot check recorded in the original session:
# matchMatrix['MAPFRECFEDP000000252'] -> 1
| 142 |
|
# Load the NR001 load file. Each data row is
# "bates|claim|createdDt|lastmodDt" with dates written as MM/DD/YYYY.
with open(r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\NR001\Data\CONCORD.DAT") as datFile:
    # The first row is dropped (presumably the header row - TODO confirm).
    # Blank rows are dropped too, since they cannot be unpacked into four
    # fields by either loop that walks contents.
    contents = [row for row in datFile.readlines()[1:] if row.strip()]
claimMatrix = {}

# first we populate the date matrix
# claimDtMatrix maps the lower-cased claim to the earliest datetime seen in
# any of its rows.
claimDtMatrix = {}

for line in contents:
    line = line.replace("\n", "")
    bates, claim, createdDt, lastmodDt = line.split("|")
    claim = claim.lower()
    # Rows without a created date are skipped, as in the original.
    if not createdDt:
        continue
    # The original parsed lastmodDt unconditionally, so a row with a created
    # date but an empty last-modified date crashed strptime. Only the fields
    # that are present are parsed now.
    tempDtList = [
        datetime.datetime.strptime(rawDt, "%m/%d/%Y")
        for rawDt in (createdDt, lastmodDt)
        if rawDt
    ]
    rowEarliest = min(tempDtList)
    if claim not in claimDtMatrix or rowEarliest < claimDtMatrix[claim]:
        claimDtMatrix[claim] = rowEarliest

# Spot check recorded in the original session:
# claimDtMatrix["ccky60"] -> datetime.datetime(2013, 3, 18, 0, 0)
| 165 |
|
| 166 |
# Now the main matrix
|
# Second pass over the same rows: file every Bates number under its
# lower-cased claim. The two date columns are unpacked only so that a row
# with the wrong number of fields still fails loudly.
for line in contents:
    line = line.replace("\n", "")
    bates, claim, createdDt, lastmodDt = line.split("|")
    claim = claim.lower()
    # Create the claim's list on first sight, then append to it.
    claimMatrix.setdefault(claim, []).append(bates)
| 176 |
|
# Write the NR001 per-claim report: "claim|count|earliestDate" rows, with the
# date left empty for claims that have no entry in claimDtMatrix.

# Sorted so the rows come out in a stable order, matching the first report;
# the original iterated the dict keys in whatever order the dict held them.
claimList = sorted(claimMatrix)

with open(r"C:\Test-PY\Glassman\20181017_NR_claimReport", 'w') as outputFile:
    for claim in claimList:
        # A Bates number with no entry in matchMatrix raises KeyError on
        # purpose: it means a document in the load file has no text file.
        claimCount = sum(matchMatrix[bates] for bates in claimMatrix[claim])
        # Membership is tested on the dict itself rather than on keys().
        if claim in claimDtMatrix:
            outputFile.write("%s|%s|%s\n" % (
                claim,
                claimCount,
                claimDtMatrix[claim].strftime("%m/%d/%Y"),
            ))
        else:
            outputFile.write("%s|%s|\n" % (claim, claimCount))