Skip to content

Instantly share code, notes, and snippets.

@nanzono
Created June 27, 2014 08:57
Show Gist options
  • Save nanzono/a3794b148d288552e8e5 to your computer and use it in GitHub Desktop.
Save nanzono/a3794b148d288552e8e5 to your computer and use it in GitHub Desktop.
Check for duplicate rows.
# coding:utf-8
import os
import hashlib
import sqlite3
# Root directory whose files are scanned (walked recursively) for input rows.
target_path = './20140620/'
# SQLite database file that holds the working ``logs`` table.
db_path = './logs.db'
# File that receives the report of duplicated rows.
file_output = './output.txt'
def _as_text(raw):
    """Return *raw* as text, decoding byte strings as UTF-8.

    Undecodable bytes are replaced instead of raising, so a single
    malformed line cannot abort the whole import.
    """
    if isinstance(raw, bytes):
        return raw.decode("utf-8", "replace")
    return raw


def store_input_file(source_dir=None, database=None):
    """Load every line of every file under a directory into the ``logs`` table.

    Each line becomes one row holding a running id (starting at 1), the
    SHA-256 hex digest of the raw line, the name of the file it came from
    and the line content without its trailing newline.  The ``logs`` table
    must already exist in the database.

    Args:
        source_dir: Directory tree to scan.  Defaults to the module-level
            ``target_path``.
        database: Path of the SQLite database file.  Defaults to the
            module-level ``db_path``.
    """
    print("store_input_file")
    if source_dir is None:
        source_dir = target_path
    if database is None:
        database = db_path

    # Bound parameters instead of string interpolation: a line containing a
    # quote (or any SQL) is stored verbatim rather than breaking or
    # injecting into the statement.
    insert_sql = "INSERT INTO logs (id, digest, file, val) VALUES (?, ?, ?, ?)"
    batch_size = 10  # rows written and committed per round trip
    pending = []
    row_id = 0

    db = sqlite3.connect(database)
    try:
        for dpath, _dnames, fnames in os.walk(source_dir):
            for fname in fnames:
                with open(os.path.join(dpath, fname), "rb") as fi:
                    for row in fi:
                        row_id += 1
                        # NOTE: the digest covers the raw bytes, including
                        # the line terminator when one is present.
                        digest = hashlib.sha256(row).hexdigest()
                        # Only drop a real trailing newline; the last line
                        # of a file may not have one.
                        content = row[:-1] if row.endswith(b"\n") else row
                        pending.append(
                            (row_id, digest, _as_text(fname), _as_text(content))
                        )
                        if len(pending) >= batch_size:
                            db.executemany(insert_sql, pending)
                            db.commit()
                            pending = []
        if pending:
            db.executemany(insert_sql, pending)
            db.commit()
    finally:
        db.close()
def check_duplicated(database=None, output=None):
    """Write every row whose digest occurs more than once to a report file.

    One line is written per duplicated row: the ``str()`` of the tuple
    ``(id, digest, file, val, cnt)``, where ``cnt`` is the number of rows
    sharing that digest.  Rows are grouped by digest and, within a digest,
    listed in ascending id order so the report is reproducible.

    Args:
        database: Path of the SQLite database file.  Defaults to the
            module-level ``db_path``.
        output: Path of the report file, which is overwritten.  Defaults to
            the module-level ``file_output``.
    """
    print("check_duplicated")
    if database is None:
        database = db_path
    if output is None:
        output = file_output

    sql = """
        SELECT ls.id, ls.digest, ls.file, ls.val, m.cnt
        FROM logs ls
        INNER JOIN (
            SELECT digest, COUNT(*) cnt
            FROM logs
            GROUP BY digest
            HAVING COUNT(*) > 1
        ) m ON ls.digest = m.digest
        ORDER BY ls.digest, ls.id
    """
    db = sqlite3.connect(database)
    try:
        # Run the query before touching the report so a failing query does
        # not truncate an existing file.
        cur = db.execute(sql)
        with open(output, "wb") as fo:
            for record in cur:
                # The file is opened in binary mode, so encode explicitly;
                # this keeps the write valid on both Python 2 and 3.
                fo.write((str(record) + "\n").encode("utf-8"))
    finally:
        db.close()
def create_table(database=None):
    """Drop and recreate the ``logs`` working table.

    Any existing ``logs`` table, and the rows in it, is discarded, so each
    run starts from an empty table.

    Args:
        database: Path of the SQLite database file.  Defaults to the
            module-level ``db_path``.
    """
    print("create_table")
    if database is None:
        database = db_path

    ddl = """
        DROP TABLE IF EXISTS logs;
        CREATE TABLE logs (
            id INT
            , digest TEXT
            , file TEXT
            , val TEXT
        );
    """
    db = sqlite3.connect(database)
    try:
        db.executescript(ddl)
        db.commit()
    finally:
        # Release the database handle even if the DDL fails.
        db.close()
if __name__ == "__main__":
    # Run the pipeline in order: rebuild the table, load the input rows,
    # then report the duplicates.
    for step in (create_table, store_input_file, check_duplicated):
        step()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment