ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/PerligoDupeGroup.py
Revision: 581
Committed: Thu Mar 26 13:51:25 2015 UTC (11 years ago) by nino.borges
Content type: text/x-python
File size: 2965 byte(s)
Log Message:
Add before leaving

File Contents

# Content
"""
PerligoDupeGroup

This project for Ashley was to group documents by "near dupe" and report
each group as a parent document followed by its near-duplicates.

Inputs (all plain text):
  * NearDupesExport.dat  - one near-dupe relationship per line, doc IDs
                           separated by "|".
  * originals_list.dat   - one doc ID per line; preferred parents.
  * second_orig_list.dat - one doc ID per line; parents that take priority
                           over originals_list.dat.

Outputs:
  * NearDupesOutput.dat  - one line per group, the Python list repr of its
                           members.
  * finalReport_near.txt - one line per group: "parent|child,child,...,".

"""

NEAR_DUPES_EXPORT = r"C:\Users\eborges\Box Sync\Client\Honeywell\Perligo\20150109_dedupeProj\NearDupesExport.dat"
NEAR_DUPES_OUTPUT = r"C:\Users\eborges\Box Sync\Client\Honeywell\Perligo\20150109_dedupeProj\NearDupesOutput.dat"
ORIGINALS_LIST = r"C:\Users\eborges\Box Sync\Client\Honeywell\Perligo\20150109_dedupeProj\originals_list.dat"
SECOND_ORIGINALS_LIST = r"C:\Users\eborges\Box Sync\Client\Honeywell\Perligo\20150109_dedupeProj\second_orig_list.dat"
FINAL_REPORT = r"C:\Users\eborges\Box Sync\Client\Honeywell\Perligo\20150109_dedupeProj\finalReport_near.txt"


def strip_line_ending(line):
    """Return *line* with every newline and carriage-return removed."""
    return line.replace("\n", "").replace("\r", "")


def parse_id_line(line):
    """Split one "|"-delimited export line into its non-empty doc IDs.

    Empty fields (blank lines, trailing delimiters) are dropped; otherwise
    the empty string would act as a doc ID shared by unrelated lines and
    wrongly tie their groups together.
    """
    return [doc_id for doc_id in strip_line_ending(line).split("|") if doc_id]


def read_id_list(path):
    """Read a one-ID-per-line file and return the IDs as a list.

    Blank lines are kept as empty strings so the reported count matches the
    number of lines in the file.
    """
    with open(path) as handle:
        return [strip_line_ending(line) for line in handle]


def group_near_dupes(id_lines):
    """Cluster doc IDs so that IDs sharing any line end up in one group.

    *id_lines* is an iterable of lists of doc IDs. Returns a tuple
    ``(group_of, members)`` where ``group_of`` maps each doc ID to its group
    number and ``members`` maps each group number to the list of its doc IDs.

    When a line mentions IDs that already belong to different groups, those
    groups are merged. Moving only the IDs on the line would leave the rest
    of the older group behind and split one near-dupe cluster in two.
    """
    group_of = {}
    members = {}
    next_group = 1
    for doc_ids in id_lines:
        if not doc_ids:
            continue

        # Existing groups this line touches, in first-seen order.
        touched = []
        for doc_id in doc_ids:
            group = group_of.get(doc_id)
            if group is not None and group not in touched:
                touched.append(group)

        if touched:
            # Keep the earliest-created group so numbering stays stable.
            target = min(touched)
        else:
            target = next_group
            next_group += 1
            members[target] = []

        # Fold every other touched group into the target.
        for group in touched:
            if group == target:
                continue
            for doc_id in members.pop(group):
                group_of[doc_id] = target
                members[target].append(doc_id)

        # Anything still unassigned is new to this line.
        for doc_id in doc_ids:
            if doc_id not in group_of:
                group_of[doc_id] = target
                members[target].append(doc_id)
    return group_of, members


def choose_parent(sorted_group, originals, second_originals):
    """Pick the parent doc ID for a group.

    *sorted_group* must be sorted ascending and non-empty. A member found in
    *second_originals* wins over one found in *originals*; within a list the
    smallest matching ID wins. If neither list matches, the smallest member
    of the group is the parent.
    """
    for preferred in (second_originals, originals):
        for doc_id in sorted_group:
            if doc_id in preferred:
                return doc_id
    return sorted_group[0]


def format_report_line(group, originals, second_originals):
    """Return the report line "parent|child,child,...,\\n" for *group*.

    Every child is followed by a comma, including the last one; this
    trailing comma is part of the existing report format.
    """
    ordered = sorted(group)
    parent = choose_parent(ordered, originals, second_originals)
    children = "".join(doc_id + "," for doc_id in ordered if doc_id != parent)
    return parent + "|" + children + "\n"


def main():
    """Run the full near-dupe grouping and write both output files."""
    with open(NEAR_DUPES_EXPORT) as handle:
        id_lines = [parse_id_line(line) for line in handle]
    group_of, members = group_near_dupes(id_lines)
    print("initial matrix done. %d count" % len(group_of))

    with open(NEAR_DUPES_OUTPUT, 'w') as output_file:
        for group in members.values():
            output_file.write(str(group) + "\n")
    print("Second matrix done. %d count" % len(members))

    orig_list = read_id_list(ORIGINALS_LIST)
    print("orig matrix done. %d count." % len(orig_list))

    orig_list2 = read_id_list(SECOND_ORIGINALS_LIST)
    print("orig 2 matrix done. %d count." % len(orig_list2))

    # Sets give constant-time membership tests; choose_parent scans the
    # sorted group instead of scanning both full lists for every group.
    originals = set(orig_list)
    second_originals = set(orig_list2)

    with open(FINAL_REPORT, 'w') as output_file:
        for group in members.values():
            output_file.write(
                format_report_line(group, originals, second_originals))


if __name__ == '__main__':
    main()