Active_prgs/Redgrave/Amazon-AnalyzeNamesDeepNormOutput.py

"""

Amazon-AnalyzeNamesDeepNormOutput

Created by:
Emanuel Borges
12.17.2024

This program assists with analyzing the more complex 'deep' output logs from the Names Norm program.  It takes in all of the vals output files and the priv log,
and compares the high confidence downgrades and upgrades to determine, if these were changed, whether any attorney values would survive in the legalSource field.

"""


import os

def FieldFullValueDedupe(valuesList):
    """Deduplicates the values of a single field by comparing each FULL VALUE.

    Every entry of valuesList is stripped of surrounding whitespace and uppercased before
    being collected, so values that differ only by case or padding collapse into one.
    This exists because duplicate values have been seen in the formatted fields.
    Returns a new set holding the cleaned values."""
    ##  Uppercasing explicitly: the values should already be uppercase, but then again there should not have been dups either...
    return {rawValue.strip().upper() for rawValue in valuesList}


def ExtractHighConfidenceDowngrades(valsLines, sourceName="<vals>"):
    """Yields a (docID, attorneyValue) pair for every high confidence downgrade found in valsLines.

    valsLines  -- iterable of raw text lines, each in the form 'docID|issue;issue;...'.
    sourceName -- label used only in the error message when a line is malformed.

    An issue counts when it contains the text 'High Confidence Potential downgrade'.  The
    attorney value is the part of that issue before the first '(', stripped and uppercased.
    Blank lines are skipped.  Raises ValueError, naming the source and line number, when a
    non-blank line does not split into exactly two parts on '|'."""
    for lineNumber, rawLine in enumerate(valsLines, start=1):
        line = rawLine.replace("\n","")
        if not line.strip():
            ##  A blank line (typically the last one in the file) carries no data and would otherwise fail the unpacking below.
            continue
        try:
            docID, potIssues = line.split("|")
        except ValueError:
            raise ValueError(f"Line {lineNumber} of {sourceName} is not in the form 'docID|issues': {line!r}") from None
        for potIssue in potIssues.split(";"):
            if "High Confidence Potential downgrade" in potIssue:
                yield docID, potIssue.split("(")[0].strip().upper()


if __name__ == '__main__':
    ##  VEAS
    valsFilesToIngestList = ['NameNormDeepOutputText(ToVals).txt','NameNormDeepOutputText(ccVals).txt', 'NameNormDeepOutputText(fromVals).txt',
                             'NameNormDeepOutputText(authorValue).txt']
    pathToValsFiles = r"C:\Test_Dir\Amazon\VEAS-CAAG-20241204\Testing3"
    privLogFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    legalSourcePosition = 40


    ## CAAG
##    valsFilesToIngestList = ['NameNormDeepOutputText(bccVals).txt', 'NameNormDeepOutputText(ToVals).txt','NameNormDeepOutputText(ccVals).txt', 'NameNormDeepOutputText(fromVals).txt',
##                             'NameNormDeepOutputText(docAuthor).txt']
##    pathToValsFiles = r"C:\Test_Dir\Amazon\20241215\20241211_CAAG"
##    privLogFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241215\PrivLogExports\PrivLogExport_20241211_CAAG_Converted.txt"
##    legalSourcePosition = 45

    ##  Maps each docID to the set of uppercased attorney values flagged as high confidence downgrades.
    highProfileMatrix = {}

    ##  The output file is opened first (as before) so a failed run never leaves a stale report behind,
    ##  and the with block guarantees it is closed even if one of the inputs blows up.
    with open(r"C:\Test_Dir\Amazon\deepAnalysisTesting.txt",'w') as outputFile:

        ##  First ingest any high profile downgrades into the matrix.
        for fName in valsFilesToIngestList:
            ##  NOTE(review): the vals files are read with the platform default encoding while the priv log is read as UTF-8 -- confirm they match.
            with open(os.path.join(pathToValsFiles,fName)) as valsFile:
                for docID, attnyVal in ExtractHighConfidenceDowngrades(valsFile, fName):
                    if docID in highProfileMatrix:
                        highProfileMatrix[docID].add(attnyVal)
                        print(f"adding another {highProfileMatrix[docID]}, {docID}")
                    else:
                        highProfileMatrix[docID] = {attnyVal}

        ##  Second, for each line in the privLog, determine if removing the names from legalSource will result in no surviving attorney values for that doc.
        with open(privLogFileName, encoding = 'UTF-8') as privLogFile:
            contents = privLogFile.readlines()
        headerRow = contents[0].split("|")
        contents = contents[1:]
        print(f"The field {headerRow[legalSourcePosition]} will be used for the legal source field.")
        print(f"There are {len(contents)} to test.")

        for line in contents:
            fields = line.replace("\n","").split("|")
            docID = fields[0]
            ##  Direct dict membership test; only docs with a high confidence downgrade are of interest.
            if docID not in highProfileMatrix:
                continue

            legalSourceValues = FieldFullValueDedupe(fields[legalSourcePosition].split(";"))
            downGradeSet = highProfileMatrix[docID]
            remainderLegalSources = legalSourceValues - downGradeSet

            ##  A remaining value containing '*' is treated as a surviving attorney value.
            attnyStillExists = any("*" in remainValue for remainValue in remainderLegalSources)

            if attnyStillExists:
                ##  Sorted so the report is identical from run to run (set order is arbitrary).
                outputFile.write(f"Even after removing {';'.join(sorted(downGradeSet))}, the following will still be in the legal sources for {docID}, {';'.join(sorted(remainderLegalSources))}.\n")
            else:
                outputFile.write(f"Once downgrades removed from {docID}, no attorney names will be left!!\n")
##    print(f"There are {len(highProfileMatrix.keys())} doc IDs with high conf downgrades.")
##    testSet = set()
##    for k in list(highProfileMatrix.keys()):
##        for i in highProfileMatrix[k]:
##            testSet.add(i)
##    print(f"There are {len(testSet)} unique names across all of these.")
##    for i in testSet:
##        print(i)
Revision:	871
Committed:	Fri Dec 20 17:00:12 2024 UTC (15 months ago) by nino.borges
Content type:	text/x-python
File size:	5104 byte(s)
Log Message:	This program will assist with analyzing the more complex 'deep' output logs from the Names Norm program. This will take in all of the vals output files and the priv log, and will compare the high confidence downgrades and upgrades to determine if these were changed, if there are any surviving attorney values in the legalSource field.
#	Content
1	"""
2
3	Amazon-AnalyzeNamesDeepNormOutput
4
5	Created by:
6	Emanuel Borges
7	12.17.2024
8
9	This program will assist with analyzing the more complex 'deep' output logs from the Names Norm program. This will take in all of the vals output files and the priv log,
10	and will compare the high confidence downgrades and upgrades to determine if these were changed, if there are any surviving attorney values in the legalSource field.
11
12	"""
13
14
15	import os
16
17	def FieldFullValueDedupe(valuesList):
18	"""Attempts to deduplicate a list of values from a specific field using the FULL VALUE. This was created because there appears to be duplicate values in the formatted fields.
19	returns a new set with just the cleaned and deduplicated values."""
20	## Going to do this the long way because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
21	newSet = set()
22	for item in valuesList:
23	item = item.strip()
24	newSet.add(item.upper())
25	return newSet
26
27
28	if __name__ == '__main__':
29	## VEAS
30	valsFilesToIngestList = ['NameNormDeepOutputText(ToVals).txt','NameNormDeepOutputText(ccVals).txt', 'NameNormDeepOutputText(fromVals).txt',
31	'NameNormDeepOutputText(authorValue).txt']
32	pathToValsFiles = r"C:\Test_Dir\Amazon\VEAS-CAAG-20241204\Testing3"
33	privLogFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
34	legalSourcePosition = 40
35
36
37	## CAAG
38	## valsFilesToIngestList = ['NameNormDeepOutputText(bccVals).txt', 'NameNormDeepOutputText(ToVals).txt','NameNormDeepOutputText(ccVals).txt', 'NameNormDeepOutputText(fromVals).txt',
39	## 'NameNormDeepOutputText(docAuthor).txt']
40	## pathToValsFiles = r"C:\Test_Dir\Amazon\20241215\20241211_CAAG"
41	## privLogFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241215\PrivLogExports\PrivLogExport_20241211_CAAG_Converted.txt"
42	## legalSourcePosition = 45
43
44	outputFile = open(r"C:\Test_Dir\Amazon\deepAnalysisTesting.txt",'w')
45
46	highProfileMatrix = {}
47
48
49	## First ingest any high profile downgrades into the matrix.
50	for fName in valsFilesToIngestList:
51	contents = open(os.path.join(pathToValsFiles,fName)).readlines()
52	for line in contents:
53	line = line.replace("\n","")
54	docID, potIssues = line.split("|")
55	potIssues = potIssues.split(";")
56	for potIssue in potIssues:
57	if "High Confidence Potential downgrade" in potIssue:
58	attnyVal = potIssue.split("(")[0]
59	attnyVal = attnyVal.strip()
60	#attnyVal = attnyVal + ")"
61	try:
62	highProfileMatrix[docID].add(attnyVal.upper())
63	print(f"adding another {highProfileMatrix[docID]}, {docID}")
64	except KeyError:
65	highProfileMatrix[docID] = set()
66	highProfileMatrix[docID].add(attnyVal.upper())
67
68	## Second, for each line in the privLog, determine if removing the names from legalSource will result in no surviving attorney values for that doc.
69	contents = open(privLogFileName, encoding = 'UTF-8').readlines()
70	headerRow = contents[0]
71	headerRow = headerRow.split("|")
72	contents = contents[1:]
73	print(f"The field {headerRow[legalSourcePosition]} will be used for the legal source field.")
74	print(f"There are {len(contents)} to test.")
75
76	for line in contents:
77	line = line.replace("\n","")
78	line = line.split("|")
79	docID = line[0]
80	if docID in list(highProfileMatrix.keys()):
81	#print(docID)
82	legalSourceValues = line[legalSourcePosition]
83	legalSourceValues = legalSourceValues.split(";")
84	legalSourceValues = FieldFullValueDedupe(legalSourceValues)
85
86	downGradeSet = highProfileMatrix[docID]
87	remainderLegalSources = legalSourceValues - downGradeSet
88	attnyStillExists = False
89	for remainValue in remainderLegalSources:
90	if "*" in remainValue:
91	attnyStillExists = True
92
93	if attnyStillExists == False:
94	outputFile.write(f"Once downgrades removed from {docID}, no attorney names will be left!!\n")
95	else:
96	outputFile.write(f"Even after removing {';'.join(downGradeSet)}, the following will still be in the legal sources for {docID}, {';'.join(remainderLegalSources)}.\n")
97
98	outputFile.close()
99	## print(f"There are {len(highProfileMatrix.keys())} doc IDs with high conf downgrades.")
100	## testSet = set()
101	## for k in list(highProfileMatrix.keys()):
102	## for i in highProfileMatrix[k]:
103	## testSet.add(i)
104	## print(f"There are {len(testSet)} unique names across all of these.")
105	## for i in testSet:
106	## print(i)