Created
December 11, 2012 13:31
-
-
Save chirdeeptomar/4258585 to your computer and use it in GitHub Desktop.
Finds duplicate rows in an Excel 2007 (.xlsx) workbook.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from collections import Counter

from openpyxl import load_workbook
# Directory containing this script.  The path is made absolute first because
# ``__file__`` can be a bare file name (e.g. when run as ``python script.py``);
# its ``dirname`` is then '' and ``os.path.relpath('')`` raises ValueError.
dir_name = os.path.dirname(os.path.abspath(__file__))
# Workbook to scan; it is looked up next to this script.
file_name = os.path.join(dir_name, 'Data.xlsx')

# Row keys without repeats, in first-seen order (filled by add_to_list).
unique_items = []
# One key per data row, repeats included (filled by find_duplicates).
all_items = []
def add_to_list(item):
    """Append *item* to the module-level ``unique_items`` list if it is new.

    An item that is already present is left alone, so the list keeps each
    value once, in the order it was first offered.
    """
    if item in unique_items:
        return
    unique_items.append(item)
def find_duplicates(filename, has_header=False):
    """Collect one comparison key per data row of a workbook's active sheet.

    A row's key is the text of its column A cell and its column F cell, each
    followed by ``'#'``.  Every key is appended to the module-level
    ``all_items`` list and offered to ``add_to_list``, which keeps only the
    distinct ones.  Nothing is returned.

    Args:
        filename: Path of the ``.xlsx`` workbook to read.
        has_header: If true, the first row of the sheet is skipped.
    """
    # Zero-based positions of the key columns A and F (use 9 and 11 as well
    # to include columns J and L).  Cells are picked by position rather than
    # by comparing ``cell.column`` with a letter: recent openpyxl releases
    # report the column as a number, so the letter comparison never matches.
    key_positions = (0, 5)

    # read_only=True is the current spelling of the former use_iterators=True
    # flag: rows are streamed instead of loading the whole sheet into memory.
    wb = load_workbook(filename=filename, read_only=True)
    try:
        rows = iter(wb.active.iter_rows())
        if has_header:
            next(rows, None)  # discard the header row, if there is one
        for row in rows:
            parts = []
            for position in key_positions:
                # Short rows simply have no cell at this position.
                value = row[position].value if position < len(row) else None
                # Cells may hold numbers, dates or nothing at all; normalise
                # to text so the key can always be built.
                parts.append('' if value is None else str(value))
            item = ''.join(part + '#' for part in parts)
            add_to_list(item)
            all_items.append(item)
    finally:
        # A read-only workbook keeps the file open until closed explicitly.
        wb.close()
# Scan the workbook (first row is a header) to fill all_items / unique_items.
find_duplicates(file_name, True)

print("Total Items: ", len(all_items))
print("Total Unique Items: ", len(unique_items))

# Tally every key in a single pass.  Calling all_items.count() once per
# unique key would rescan the whole list each time.
occurrences = Counter(all_items)

total_duplicates = 0
# Walk unique_items so duplicates are reported in first-seen order.
for x in unique_items:
    if occurrences[x] > 1:
        total_duplicates += 1
        print("Duplicate Item: ", x)
print("Total Duplicates Found: ", total_duplicates)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hmm, it didn't work for me: it reported 27 unique items when in fact there were 31.
Thanks for sharing, though.