| 1 |
## Lee-Mercer.
|
| 2 |
## Hash the parent records of an original QMobile delivery, then hash the parent records of a new delivery,
|
| 3 |
## and compare the two sets of hashes so that only the records new to the second delivery are pulled out.
|
| 4 |
|
| 5 |
>>> contents = open(r"\\sas22\sas22\35081\Inbound\08\103950\load_file.txt").readlines()
|
| 6 |
>>> contents[0]
|
| 7 |
'\xc3\xbeForensic Item Number\xc3\xbe\x14\xc3\xbeCustodian - Single Choice\xc3\xbe\x14\xc3\xbeDocID\xc3\xbe\x14\xc3\xbeGroup Identifier\xc3\xbe\x14\xc3\xbeBegAttach\xc3\xbe\x14\xc3\xbeEndAttach\xc3\xbe\x14\xc3\xbeMessage Thread ID\xc3\xbe\x14\xc3\xbeMobile Duplicate ID\xc3\xbe\x14\xc3\xbeChat Parties\xc3\xbe\x14\xc3\xbeParties_QD\xc3\xbe\x14\xc3\xbeParties by Group_QD\xc3\xbe\x14\xc3\xbeDate/Time Chat Start\xc3\xbe\x14\xc3\xbeDate/Time Chat End\xc3\xbe\x14\xc3\xbeDate/Time Mobile Event\xc3\xbe\x14\xc3\xbeDate/Time Message Delivered\xc3\xbe\x14\xc3\xbeDate/Time Message Read\xc3\xbe\x14\xc3\xbeMobile Folder\xc3\xbe\x14\xc3\xbeMobile To\xc3\xbe\x14\xc3\xbeMobile From\xc3\xbe\x14\xc3\xbeText Message Subject\xc3\xbe\x14\xc3\xbeHas Message Attachment\xc3\xbe\x14\xc3\xbeIs Message Attachment\xc3\xbe\x14\xc3\xbeFile Name\xc3\xbe\x14\xc3\xbeFile Size\xc3\xbe\x14\xc3\xbeDocument Extension\xc3\xbe\x14\xc3\xbeDate/Time Created\xc3\xbe\x14\xc3\xbeDate/Time Last Modified\xc3\xbe\x14\xc3\xbeAttachment MD5\xc3\xbe\x14\xc3\xbeMessage Read Status\xc3\xbe\x14\xc3\xbeMobile Deleted or Intact\xc3\xbe\x14\xc3\xbeChat Deleted\xc3\xbe\x14\xc3\xbeMessage Type\xc3\xbe\x14\xc3\xbeMessage SubType\xc3\xbe\x14\xc3\xbeCall Duration\xc3\xbe\x14\xc3\xbeMobile Country Code\xc3\xbe\x14\xc3\xbeMobile Video Call\xc3\xbe\x14\xc3\xbeQMobile Time Zone Field\xc3\xbe\x14\xc3\xbeExtracted Text\xc3\xbe\x14\xc3\xbeNative File Path\xc3\xbe\x14\xc3\xbeParent Message Type\xc3\xbe\x14\xc3\xbeMobile Source File Name\xc3\xbe\x14\xc3\xbeMobile Extraction Source\xc3\xbe\x14\xc3\xbeMobile Event Direction\xc3\xbe\x14\xc3\xbeException Reason\xc3\xbe\x14\xc3\xbeTime Zone Field\xc3\xbe\x14\xc3\xbeRelativity Native Time Zone Offset\xc3\xbe\n'
|
| 8 |
## Drop the header row (echoed just above) so that only data rows remain.
>>> contents = contents[1:]
|
| 9 |
>>> origMatrix = {}
|
| 10 |
## Fields in the load file are separated by the \x14 control character, as the echoed header row shows.
>>> delim = "\x14"
|
| 11 |
|
| 12 |
>>> import hashlib
|
| 13 |
|
| 14 |
>>> for line in contents:
|
| 15 |
... line = line.replace("\n","")
|
| 16 |
... line = line.split(delim)
|
| 17 |
... bates = line[2]
|
| 18 |
... if bates == line[4]:
|
| 19 |
... ## Parent found
|
| 20 |
... rawStringContents = "%s%s%s%s%s%s"% (line[6], line[13], line[17], line[18], line[16], line[37])
|
| 21 |
... hashValue = hashlib.md5(rawStringContents).hexdigest()
|
| 22 |
... origMatrix[bates] = hashValue
|
| 23 |
...
|
| 24 |
>>> len(origMatrix.keys())
|
| 25 |
62665
|
| 26 |
|
| 27 |
>>> colList = []
|
| 28 |
>>> for i in range(6,46):
|
| 29 |
... colList.append(i)
|
| 30 |
...
|
| 31 |
>>> colList
|
| 32 |
[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]
|
| 33 |
## Leave columns 38 (Native File Path) and 37 (Extracted Text) out of the hashed columns.
## Column 37 is used later as a path to a text file whose contents are hashed instead of the path itself.
>>> colList.remove(38)
|
| 34 |
>>> colList.remove(37)
|
| 35 |
>>> origMatrix = {}
|
| 36 |
>>> for line in contents:
|
| 37 |
... line = line.replace("\n","")
|
| 38 |
... line = line.split(delim)
|
| 39 |
... bates = line[2]
|
| 40 |
... if bates == line[4]:
|
| 41 |
... ## Parent found
|
| 42 |
... textPath2 = line[37].replace("\xc3\xbe","")
|
| 43 |
... if textPath2:
|
| 44 |
... textContents = open(os.path.join(textPath,textPath2)).read()
|
| 45 |
... else:
|
| 46 |
... textContents = ""
|
| 47 |
... rawStringContents = ""
|
| 48 |
... for i in colList:
|
| 49 |
... rawStringContents = rawStringContents + line[i]
|
| 50 |
... rawStringContents = rawStringContents + textContents
|
| 51 |
... hashValue = hashlib.md5(rawStringContents).hexdigest()
|
| 52 |
... origMatrix[hashValue] = bates
|
| 53 |
...
|
| 54 |
>>> len(origMatrix.keys())
|
| 55 |
62606
|
| 56 |
>>> newMatrix = {}
|
| 57 |
>>> textPath = r"\\sas22\sas22\35081\Inbound\New\QMobile Chat"
|
| 58 |
>>> contents = open(r"\\sas22\sas22\35081\Inbound\New\QMobile Chat\load_file.txt").readlines()
|
| 59 |
>>> contents[0]
|
| 60 |
'\xc3\xbeForensic Item Number\xc3\xbe\x14\xc3\xbeCustodian - Single Choice\xc3\xbe\x14\xc3\xbeDocID\xc3\xbe\x14\xc3\xbeGroup Identifier\xc3\xbe\x14\xc3\xbeBegAttach\xc3\xbe\x14\xc3\xbeEndAttach\xc3\xbe\x14\xc3\xbeMessage Thread ID\xc3\xbe\x14\xc3\xbeMobile Duplicate ID\xc3\xbe\x14\xc3\xbeChat Parties\xc3\xbe\x14\xc3\xbeParties_QD\xc3\xbe\x14\xc3\xbeParties by Group_QD\xc3\xbe\x14\xc3\xbeDate/Time Chat Start\xc3\xbe\x14\xc3\xbeDate/Time Chat End\xc3\xbe\x14\xc3\xbeDate/Time Mobile Event\xc3\xbe\x14\xc3\xbeDate/Time Message Delivered\xc3\xbe\x14\xc3\xbeDate/Time Message Read\xc3\xbe\x14\xc3\xbeMobile Folder\xc3\xbe\x14\xc3\xbeMobile To\xc3\xbe\x14\xc3\xbeMobile From\xc3\xbe\x14\xc3\xbeText Message Subject\xc3\xbe\x14\xc3\xbeHas Message Attachment\xc3\xbe\x14\xc3\xbeIs Message Attachment\xc3\xbe\x14\xc3\xbeFile Name\xc3\xbe\x14\xc3\xbeFile Size\xc3\xbe\x14\xc3\xbeDocument Extension\xc3\xbe\x14\xc3\xbeDate/Time Created\xc3\xbe\x14\xc3\xbeDate/Time Last Modified\xc3\xbe\x14\xc3\xbeAttachment MD5\xc3\xbe\x14\xc3\xbeMessage Read Status\xc3\xbe\x14\xc3\xbeMobile Deleted or Intact\xc3\xbe\x14\xc3\xbeChat Deleted\xc3\xbe\x14\xc3\xbeMessage Type\xc3\xbe\x14\xc3\xbeMessage SubType\xc3\xbe\x14\xc3\xbeCall Duration\xc3\xbe\x14\xc3\xbeMobile Country Code\xc3\xbe\x14\xc3\xbeMobile Video Call\xc3\xbe\x14\xc3\xbeQMobile Time Zone Field\xc3\xbe\x14\xc3\xbeExtracted Text\xc3\xbe\x14\xc3\xbeNative File Path\xc3\xbe\x14\xc3\xbeParent Message Type\xc3\xbe\x14\xc3\xbeMobile Source File Name\xc3\xbe\x14\xc3\xbeMobile Extraction Source\xc3\xbe\x14\xc3\xbeMobile Event Direction\xc3\xbe\x14\xc3\xbeException Reason\xc3\xbe\x14\xc3\xbeTime Zone Field\xc3\xbe\x14\xc3\xbeRelativity Native Time Zone Offset\xc3\xbe\n'
|
| 61 |
>>> contents = contents[1:]
|
| 62 |
>>> for line in contents:
|
| 63 |
... line = line.replace("\n","")
|
| 64 |
... line = line.split(delim)
|
| 65 |
... bates = line[2]
|
| 66 |
... if bates == line[4]:
|
| 67 |
... ## Parent found
|
| 68 |
... textPath2 = line[37].replace("\xc3\xbe","")
|
| 69 |
... if textPath2:
|
| 70 |
... textContents = open(os.path.join(textPath,textPath2)).read()
|
| 71 |
... else:
|
| 72 |
... textContents = ""
|
| 73 |
... rawStringContents = ""
|
| 74 |
... for i in colList:
|
| 75 |
... rawStringContents = rawStringContents + line[i]
|
| 76 |
... rawStringContents = rawStringContents + textContents
|
| 77 |
... hashValue = hashlib.md5(rawStringContents).hexdigest()
|
| 78 |
... newMatrix[hashValue] = bates
|
| 79 |
|
| 80 |
|
| 81 |
>>> len(newMatrix.keys())
|
| 82 |
91752
|
| 83 |
>>> deltaList = []
|
| 84 |
|
| 85 |
>>> nonDeltaList = []
|
| 86 |
>>> oldHashList = origMatrix.keys()
|
| 87 |
>>> newHashList = newMatrix.keys()
|
| 88 |
>>> for hashVal in newHashList:
|
| 89 |
... if hashVal in oldHashList:
|
| 90 |
... nonDeltaList.append(newMatrix[hashVal])
|
| 91 |
... else:
|
| 92 |
... deltaList.append(newMatrix[hashVal])
|
| 93 |
...
|
| 94 |
>>> len(deltaList)
|
| 95 |
47180
|
| 96 |
>>> len(nonDeltaList)
|
| 97 |
44572
|
| 98 |
|
| 99 |
|
| 100 |
|
| 101 |
>>> deltaOutPutFile = open(r"\\sas22\sas22\35081\Inbound\XFOR\Ira Lee QM\UniqueFromOrig-Load.dat",'w')
|
| 102 |
>>> duplicatesOutPutFile = open(r"\\sas22\sas22\35081\Inbound\XFOR\Ira Lee QM\DuplicatesFromOrig-Load.dat",'w')
|
| 103 |
>>> for line in contents:
|
| 104 |
... newLine = line.replace("\n","")
|
| 105 |
... newLine = newLine.split(delim)
|
| 106 |
... if newLine[3] in deltaList:
|
| 107 |
... deltaOutPutFile.write(line)
|
| 108 |
... else:
|
| 109 |
... duplicatesOutPutFile.write(line)
|
| 110 |
...
|
| 111 |
>>> deltaOutPutFile.close()
|
| 112 |
>>> duplicatesOutPutFile.close() |