NinoCode/RandomCodeRequests/Evidox_Glassman.txt

### Glassman: for every document, I had to count how many times the phrase "work product" appeared and then report those counts by claim instead of by document ###

>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\001"

>>> matchMatrix = {}
>>> for f in os.listdir(startPath):
...     fContentsString = open(os.path.join(startPath,f)).read()
...     match = re.findall(r'\bWork\W+(?:\w+\W+){0,2}?Product\b', fContentsString)
...     matchMatrix[os.path.splitext(f)[0]] = len(match)
...     
>>> matchMatrix['MAPFRECFEDP000000252']
3

>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\002"

>>> for f in os.listdir(startPath):
...     fContentsString = open(os.path.join(startPath,f)).read()
...     match = re.findall(r'\bWork\W+(?:\w+\W+){0,2}?Product\b', fContentsString)
...     matchMatrix[os.path.splitext(f)[0]] = len(match)
... 

>>> contents = open(r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Data\CONCORD.DAT").readlines()
>>> contents = contents[1:]
>>> claimMatrix = {}
>>> for line in contents:
...     line = line.replace("\n","")
...     bates,claim = line.split("|")
...     claim = claim.lower()
...     if claim in claimMatrix.keys():
...             claimMatrix[claim].append(bates)
...     else:
...             claimMatrix[claim] = [bates,]
...             
>>> len(claimMatrix.keys())
845
>>> claimMatrix["cann65"]
['MAPFRECFEDP000001418', 'MAPFRECFEDP000001751']


>>> claimList = claimMatrix.keys()
>>> claimList.sort()

>>> outputFile = open(r"C:\Test-PY\Glassman\20180918_claimReport",'w')
>>> for claim in claimList:
...     claimCount = 0
...     for bates in claimMatrix[claim]:
...             claimCount = claimCount + matchMatrix[bates]
...     outputFile.write("%s|%s\n"%(claim,claimCount))
...     
>>> outputFile.close()


### Then I was asked to grab the earliest date in the same text files and add those to the report

>>> dateMatrix = {}
>>> import datetime
>>> for f in os.listdir(startPath):
...     fContentsString = open(os.path.join(startPath,f)).read()
...     dateMatrix[os.path.splitext(f)[0]] = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b', fContentsString)
...     
>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\001"
>>> for f in os.listdir(startPath):
...     fContentsString = open(os.path.join(startPath,f)).read()
...     dateMatrix[os.path.splitext(f)[0]] = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b', fContentsString)
... 


>>> formattedDateMatrix = {}
>>> for bates in dateMatrix.keys():
...     formattedDateList = []
...     for dt in dateMatrix[bates]:
...             try:
...                     formattedDateList.append(datetime.datetime.strptime(dt, "%m/%d/%Y"))
...             except:
...                     try:
...                             formattedDateList.append(datetime.datetime.strptime(dt, "%m-%d-%Y"))
...                     except:
...                             pass
...     if len(formattedDateList) > 0:
...             formattedDateList.sort()
### This last line is an issue.  you will need to rewrite it like the one below (next loop) where it cycles through all dts in this list looking for the oldest but valid date
...             formattedDateMatrix[bates] = formattedDateList[0]
...     

>>> formattedClaimMatrix = {}
>>> for claim in claimMatrix.keys():
...     formattedClaimList = []
...     for bates in claimMatrix[claim]:
...             if bates in formattedDateList:
...                     formattedClaimList.append(formattedDateMatrix[bates])
...     formattedClaimList.sort()
...     earliestDate = False
...     for dt in formattedClaimList:
...             try:
...                     earliestDate = datetime.datetime.strftime(dt, "%m/%d/%Y")
...             except:
...                     pass
...             if earliestDate:
...                     break
...     if earliestDate:
...             formattedClaimMatrix[claim] = earliestDate
... 

>>> formattedClaimDateList = formattedClaimMatrix.keys()
>>> formattedClaimDateList.sort()
>>> for i in formattedClaimDateList:
...     outputFile.write("%s|%s\n"% (i,formattedClaimMatrix[i]))
...     
>>> outputFile.close()

## Just merging the two reports to make one
# Says bates below but that should be claim, since both files are now by claim
>>> outputFile = open(r"C:\Test-PY\Glassman\20180918_claimReportWithDates.txt",'w')
>>> contents = open(r"C:\Test-PY\Glassman\20180918_claimReport.txt").readlines()
>>> for line in contents:
...     line = line.replace("\n","")
...     bates,count = line.split("|")
...     if bates in formattedClaimDateList:
...             outputFile.write("%s|%s|%s\n"% (bates, count, formattedClaimMatrix[bates]))
...     else:
...             outputFile.write("%s|%s|\n"% (bates, count))
...             
>>> outputFile.close()


#### THIS IS THE SAME AS THE PROJECT ABOVE BUT WITH THE DATES FROM THE DAT INSTEAD ####

>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\NR001\Text\001"

>>> import os
>>> import re
>>> import datetime

>>> matchMatrix = {}
>>> for f in os.listdir(startPath):
...     fContentsString = open(os.path.join(startPath,f)).read()
...     match = re.findall(r'\bNon\W+(?:\w+\W+){0,2}?Responsive\b', fContentsString)
...     matchMatrix[os.path.splitext(f)[0]] = len(match)
... 
>>> matchMatrix['MAPFRECFEDP000000252']
1

>>> contents = open(r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\NR001\Data\CONCORD.DAT").readlines()
>>> contents = contents[1:]
>>> claimMatrix = {}

#  first we populate the date matrix
>>> claimDtMatrix = {}

>>> for line in contents:
...     line = line.replace("\n","")
...     bates,claim, createdDt,lastmodDt = line.split("|")
...     claim = claim.lower()
...     if createdDt:
...             tempDtList = [datetime.datetime.strptime(createdDt, "%m/%d/%Y"),datetime.datetime.strptime(lastmodDt, "%m/%d/%Y")]
...             tempDtList.sort()
...             if claim in claimDtMatrix.keys():
...                     if tempDtList[0] < claimDtMatrix[claim]:
...                             claimDtMatrix[claim] = tempDtList[0]
...             else:
...                     claimDtMatrix[claim] = tempDtList[0]
...                     
>>> claimDtMatrix["ccky60"]
datetime.datetime(2013, 3, 18, 0, 0)

#  Now the main matrix
>>> for line in contents:
...     line = line.replace("\n","")
...     bates,claim, createdDt,lastmodDt = line.split("|")
...     claim = claim.lower()
...     if claim in claimMatrix.keys():
...             claimMatrix[claim].append(bates)
...     else:
...             claimMatrix[claim] = [bates,]
... 

>>> outputFile = open(r"C:\Test-PY\Glassman\20181017_NR_claimReport",'w')

>>> claimList = claimMatrix.keys()

>>> for claim in claimList:
...     claimCount = 0
...     for bates in claimMatrix[claim]:
...             claimCount = claimCount + matchMatrix[bates]
...     if claim in claimDtMatrix.keys():
...             outputFile.write("%s|%s|%s\n"%(claim,claimCount, datetime.datetime.strftime(claimDtMatrix[claim], "%m/%d/%Y")))
...     else:
...             outputFile.write("%s|%s|\n"%(claim,claimCount))
...     
>>> outputFile.close()
Revision:	651
Committed:	Thu Dec 12 20:45:58 2019 UTC (6 years, 3 months ago) by nino.borges
Content type:	text/plain
File size:	7063 byte(s)
Log Message:	small updates
#	Content
1	### Glassman: for every document, I had to get a count of the times the word "work product" existed and then list those by claim instead of my doc###
2
3	>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\001"
4
5	>>> matchMatrix = {}
6	>>> for f in os.listdir(startPath):
7	... fContentsString = open(os.path.join(startPath,f)).read()
8	... match = re.findall(r'\bWork\W+(?:\w+\W+){0,2}?Product\b', fContentsString)
9	... matchMatrix[os.path.splitext(f)[0]] = len(match)
10	...
11	>>> matchMatrix['MAPFRECFEDP000000252']
12	3
13
14	>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\002"
15
16	>>> for f in os.listdir(startPath):
17	... fContentsString = open(os.path.join(startPath,f)).read()
18	... match = re.findall(r'\bWork\W+(?:\w+\W+){0,2}?Product\b', fContentsString)
19	... matchMatrix[os.path.splitext(f)[0]] = len(match)
20	...
21
22	>>> contents = open(r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Data\CONCORD.DAT").readlines()
23	>>> contents = contents[1:]
24	>>> claimMatrix = {}
25	>>> for line in contents:
26	... line = line.replace("\n","")
27	... bates,claim = line.split("\|")
28	... claim = claim.lower()
29	... if claim in claimMatrix.keys():
30	... claimMatrix[claim].append(bates)
31	... else:
32	... claimMatrix[claim] = [bates,]
33	...
34	>>> len(claimMatrix.keys())
35	845
36	>>> claimMatrix["cann65"]
37	['MAPFRECFEDP000001418', 'MAPFRECFEDP000001751']
38
39
40	>>> claimList = claimMatrix.keys()
41	>>> claimList.sort()
42
43	>>> outputFile = open(r"C:\Test-PY\Glassman\20180918_claimReport",'w')
44	>>> for claim in claimList:
45	... claimCount = 0
46	... for bates in claimMatrix[claim]:
47	... claimCount = claimCount + matchMatrix[bates]
48	... outputFile.write("%s\|%s\n"%(claim,claimCount))
49	...
50	>>> outputFile.close()
51
52
53	### Then I was asked to grab the earliest date in the same text files and add those to the report
54
55	>>> dateMatrix = {}
56	>>> import datetime
57	>>> for f in os.listdir(startPath):
58	... fContentsString = open(os.path.join(startPath,f)).read()
59	... dateMatrix[os.path.splitext(f)[0]] = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b', fContentsString)
60	...
61	>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\WP001\Text\001"
62	>>> for f in os.listdir(startPath):
63	... fContentsString = open(os.path.join(startPath,f)).read()
64	... dateMatrix[os.path.splitext(f)[0]] = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b', fContentsString)
65	...
66
67
68	>>> formattedDateMatrix = {}
69	>>> for bates in dateMatrix.keys():
70	... formattedDateList = []
71	... for dt in dateMatrix[bates]:
72	... try:
73	... formattedDateList.append(datetime.datetime.strptime(dt, "%m/%d/%Y"))
74	... except:
75	... try:
76	... formattedDateList.append(datetime.datetime.strptime(dt, "%m-%d-%Y"))
77	... except:
78	... pass
79	... if len(formattedDateList) > 0:
80	... formattedDateList.sort()
81	### This last line is an issue. you will need to rewrite it like the one below (next loop) where it cycles through all dts in this list looking for the oldest but valid date
82	... formattedDateMatrix[bates] = formattedDateList[0]
83	...
84
85	>>> formattedClaimMatrix = {}
86	>>> for claim in claimMatrix.keys():
87	... formattedClaimList = []
88	... for bates in claimMatrix[claim]:
89	... if bates in formattedDateList:
90	... formattedClaimList.append(formattedDateMatrix[bates])
91	... formattedClaimList.sort()
92	... earliestDate = False
93	... for dt in formattedClaimList:
94	... try:
95	... earliestDate = datetime.datetime.strftime(dt, "%m/%d/%Y")
96	... except:
97	... pass
98	... if earliestDate:
99	... break
100	... if earliestDate:
101	... formattedClaimMatrix[claim] = earliestDate
102	...
103
104	>>> formattedClaimDateList = formattedClaimMatrix.keys()
105	>>> formattedClaimDateList.sort()
106	>>> for i in formattedClaimDateList:
107	... outputFile.write("%s\|%s\n"% (i,formattedClaimMatrix[i]))
108	...
109	>>> outputFile.close()
110
111	## Just merging the two reports to make one
112	# Says bates below but that should be claim, since both files are now by claim
113	>>> outputFile = open(r"C:\Test-PY\Glassman\20180918_claimReportWithDates.txt",'w')
114	>>> contents = open(r"C:\Test-PY\Glassman\20180918_claimReport.txt").readlines()
115	>>> for line in contents:
116	... line = line.replace("\n","")
117	... bates,count = line.split("\|")
118	... if bates in formattedClaimDateList:
119	... outputFile.write("%s\|%s\|%s\n"% (bates, count, formattedClaimMatrix[bates]))
120	... else:
121	... outputFile.write("%s\|%s\|\n"% (bates, count))
122	...
123	>>> outputFile.close()
124
125
126	#### THIS IS THE SAME AS THE PROJECT ABOVE BUT WITH THE DATES FROM THE DAT INSTEAD ####
127
128	>>> startPath = r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\NR001\Text\001"
129
130	>>> import os
131	>>> import re
132	>>> import datetime
133
134	>>> matchMatrix = {}
135	>>> for f in os.listdir(startPath):
136	... fContentsString = open(os.path.join(startPath,f)).read()
137	... match = re.findall(r'\bNon\W+(?:\w+\W+){0,2}?Responsive\b', fContentsString)
138	... matchMatrix[os.path.splitext(f)[0]] = len(match)
139	...
140	>>> matchMatrix['MAPFRECFEDP000000252']
141	1
142
143	>>> contents = open(r"\\iadcifs01\iproshares01\PSDU-Glassman-Metropolitan\Eclipse\PSDU-Glassman-Metropolitan-Review\Production Outgoing\NR001\Data\CONCORD.DAT").readlines()
144	>>> contents = contents[1:]
145	>>> claimMatrix = {}
146
147	# first we populate the date matrix
148	>>> claimDtMatrix = {}
149
150	>>> for line in contents:
151	... line = line.replace("\n","")
152	... bates,claim, createdDt,lastmodDt = line.split("\|")
153	... claim = claim.lower()
154	... if createdDt:
155	... tempDtList = [datetime.datetime.strptime(createdDt, "%m/%d/%Y"),datetime.datetime.strptime(lastmodDt, "%m/%d/%Y")]
156	... tempDtList.sort()
157	... if claim in claimDtMatrix.keys():
158	... if tempDtList[0] < claimDtMatrix[claim]:
159	... claimDtMatrix[claim] = tempDtList[0]
160	... else:
161	... claimDtMatrix[claim] = tempDtList[0]
162	...
163	>>> claimDtMatrix["ccky60"]
164	datetime.datetime(2013, 3, 18, 0, 0)
165
166	# Now the main matrix
167	>>> for line in contents:
168	... line = line.replace("\n","")
169	... bates,claim, createdDt,lastmodDt = line.split("\|")
170	... claim = claim.lower()
171	... if claim in claimMatrix.keys():
172	... claimMatrix[claim].append(bates)
173	... else:
174	... claimMatrix[claim] = [bates,]
175	...
176
177	>>> outputFile = open(r"C:\Test-PY\Glassman\20181017_NR_claimReport",'w')
178
179	>>> claimList = claimMatrix.keys()
180
181	>>> for claim in claimList:
182	... claimCount = 0
183	... for bates in claimMatrix[claim]:
184	... claimCount = claimCount + matchMatrix[bates]
185	... if claim in claimDtMatrix.keys():
186	... outputFile.write("%s\|%s\|%s\n"%(claim,claimCount, datetime.datetime.strftime(claimDtMatrix[claim], "%m/%d/%Y")))
187	... else:
188	... outputFile.write("%s\|%s\|\n"%(claim,claimCount))
189	...
190	>>> outputFile.close()