
@chirdeeptomar
Created December 11, 2012 13:31
Finds duplicate rows in an Excel 2007 (.xlsx) workbook
from openpyxl import load_workbook
import os

dir_name = os.path.relpath(os.path.dirname(__file__))
file_name = os.path.join(dir_name, 'Data.xlsx')

unique_items = []
all_items = []


def add_to_list(item):
    if item not in unique_items:
        unique_items.append(item)


def find_duplicates(filename, has_header=False):
    wb = load_workbook(filename=filename, use_iterators=True)
    ws = wb.get_active_sheet()  # ws is now an IterableWorksheet
    for row in ws.iter_rows():  # iter_rows() streams one row at a time
        if has_header:
            has_header = False  # skip the header row only once
            continue
        item = ''
        for cell in row:
            # Only columns A and F form the duplicate key; extend as needed,
            # e.g. cell.column == 'J' or cell.column == 'L'
            if cell.column == 'A' or cell.column == 'F':
                # str() avoids a TypeError when the cell holds a number or is empty
                item += str(cell.internal_value) + '#'
        add_to_list(item)
        all_items.append(item)


find_duplicates(file_name, True)

total_duplicates = 0

print("Total Items: ", len(all_items))
print("Total Unique Items: ", len(unique_items))

for x in unique_items:
    if all_items.count(x) > 1:
        total_duplicates += 1
        print("Duplicate Item: ", x)

print("Total Duplicates Found: ", total_duplicates)
@zooid

zooid commented Jun 11, 2018

Hmm, it didn't work for me; it said it had found 27 unique items, but in fact there were 31.
Thanks for sharing though
