Last active
September 3, 2020 03:48
-
-
Save NISH1001/92d3cf353c8484940da1d124bf9d5980 to your computer and use it in GitHub Desktop.
Sample training data for task 3 of ICDAR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
('d5e1edd79d924caeafbd9f08dbfc71f9', | |
('COMMERCIAL INVOICE\n' | |
'\n' | |
'INVOICE NO :\n' | |
'1111111\n' | |
'DATE : 20-Sep-18\n' | |
'\n' | |
'INVOICE OF :\n' | |
'BAGS OF POLYETHYLENE\n' | |
'TENO/ 8138\n' | |
'SHIPPED ON :\n' | |
'YORK\n' | |
'\n' | |
'FROM :\n' | |
'Name Lastname\n' | |
'TO : NEW on Account and entire Risk of :\n' | |
'LOL :\n' | |
'\n' | |
'CENTRAL POLY ML LOL CORP\n' | |
'ETD : 24-Sep-18\n' | |
'2400 Some Place , PO Box xxxx\n' | |
'ETA : 17-Oct-18\n' | |
'\n' | |
'LLLNDX, NJ 07036, USA.\n' | |
'45 Days from the date of B/L\n' | |
':\n' | |
'DRAWN UNDER / THROUGH ORDER NO :\n' | |
'32430, 32507, 32674, 32562,32585\n' | |
'24-Sep-18\n' | |
'\n' | |
'HBL NO :\n' | |
'\n' | |
'ABCMD229\n' | |
'\n' | |
'DATE : RATE\n' | |
'\n' | |
'USD QTY\n' | |
'\n' | |
'USD RATE\n' | |
'COUNTRY OF ORIGIN\n' | |
'\n' | |
'ITEM NO\n' | |
'\n' | |
'DESCRIPTION\n' | |
'\n' | |
'QTY\n' | |
'\n' | |
'AMT CIF USD\n' | |
'\n' | |
'USD/ CASE\n' | |
'\n' | |
"40'\n" | |
'X 1 OOLU8652131\n' | |
'\n' | |
'221660N3M\n' | |
'BAGS OF POLYETHYLENE\n' | |
'806\n' | |
'\n' | |
'$\n' | |
'\n' | |
'11.820$\n' | |
'\n' | |
'9,526.92\n' | |
'\n' | |
': SIZE 515 X 320 X 78 MM\n' | |
'\n' | |
': PO NO 32430\n' | |
'\n' | |
'111112434M\n' | |
'BAGS OF POLYETHYLENE\n' | |
'205\n' | |
'\n' | |
'$\n' | |
'\n' | |
'16.745 $\n' | |
'\n' | |
'3,432.73\n' | |
'\n' | |
'505 X 265 X 135 MM\n' | |
'\n' | |
': SIZE : PO NO 32507\n' | |
'\n' | |
'231748N3MM\n' | |
'\n' | |
'BAGS OF POLYETHYLENE\n' | |
'1000\n' | |
'\n' | |
'$\n' | |
'\n' | |
'11.408\n' | |
'\n' | |
'$\n' | |
'\n' | |
'11,408.00\n' | |
'\n' | |
'400 x 270 x 100 MM\n' | |
'\n' | |
'SIZE : : PO NO 32674\n' | |
'\n' | |
'3340KXXH/250\n' | |
'BAGS OF POLYETHYLENE\n' | |
'200\n' | |
'\n' | |
'$\n' | |
'\n' | |
'8.893\n' | |
'\n' | |
'$\n' | |
'\n' | |
'1,778.60\n' | |
'\n' | |
'INDIA\n' | |
'\n' | |
'380 X 230 X 150 MM\n' | |
'\n' | |
': SIZE : PONO 32562\n' | |
'\n' | |
'232048N3MM\n' | |
'BAGS OF POLYETHYLENE\n' | |
'$\n' | |
'\n' | |
'14.408\n' | |
'\n' | |
'$\n' | |
'\n' | |
'100.86\n' | |
'\n' | |
': 420 x 290 x 110 MM\n' | |
'\n' | |
'SIZE 32430\n' | |
'\n' | |
': PO NO 232048N3MR\n' | |
'BAGS OF POLYETHYLENE\n' | |
'350\n' | |
'\n' | |
'$\n' | |
'\n' | |
'11.144\n' | |
'\n' | |
'$\n' | |
'\n' | |
'3,900.40\n' | |
'\n' | |
'480 X 285 X 105 MM\n' | |
'\n' | |
': SIZE : PONO 32585\n' | |
'\n' | |
'TOTAL\n' | |
'\n' | |
'2568\n' | |
'\n' | |
'30,147.50\n' | |
'\n' | |
'TOTAL USD CIF\n' | |
'\n' | |
'$\n' | |
'\n' | |
'30,147.50\n' | |
'\n' | |
'CIF USD THIRTY THOUSAND ONE HUNDRED FORTY SEVEN DOLLARS & CENTS FIFTY ONLY\n' | |
'\n' | |
':\n' | |
'\n' | |
'AMOUNT IN WORDS Name of Beneficiary :\n' | |
'\n' | |
'Drip Capital Inc\n' | |
'For LUCRO\n' | |
'\n' | |
'Bank Name :\n' | |
'Silicon Valley Bank\n' | |
'\n' | |
'This Invoice has been sold to LOLXXXX\n' | |
'For M/s. LLLLLXXX\n' | |
'\n' | |
'Capital Inc; 555 Bryant Street,#356\n' | |
'\n' | |
'3003 Tasman Drive, Santa Clara, CA 95054, USA.\n' | |
':\n' | |
'Palo Alto, CA 94301, USA\n' | |
'\n' | |
"Bank Address Bank's Routing Number :\n" | |
'\n' | |
'(ABA #) 112XXX982X\n' | |
'Swift Code :\n' | |
'SVBKUS6S\n' | |
'Authouse 3302198017\n' | |
'\n' | |
':\n' | |
'Account Number PLEASE PAY "DRIP CAPITAL"\n' | |
'\n' | |
'nator\n' | |
'\n' | |
'ON OUR ACCOUNT\n' | |
'\n' | |
'Authorised Signatory\n' | |
'\n' | |
'E. & O.E.\n' | |
'\n' | |
': NUMBER 2459BCKJDFLKJA33\n' | |
'\n' | |
'0312007949\n' | |
'\n' | |
': IEC CODE GST Goregaon Road ' | |
'Aashir HEAD OFFICE (W),\n' | |
'\n' | |
': INDIA. 4XXXX, Mumbai - Tel 2XXXXX / +91-222222222 : Website : ' | |
'www.example.com\n' | |
'\n' | |
'- Gujarat\n' | |
'\n' | |
'Branch Gujarat :', | |
array([10, 10, 10, ..., 10, 10, 10]))) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def data_to_text2(df):
    """Convert google's dataframe to plain text conserving blocks and lines.

    Each row contributes its ``Text`` followed by a separator chosen from
    ``space_type``:

    * ``0``      -> nothing
    * ``1``, ``2`` -> a single space
    * ``3``, ``4`` -> a single newline
    * otherwise  -> a blank line (two newlines)

    NOTE(review): the ``space_type`` codes presumably mirror the OCR
    engine's detected-break types -- confirm against the code that builds
    the dataframe.

    Every emitted character (separator characters included) is tagged with
    the row's ``label_encoded`` so the label sequence runs parallel to the
    text.

    Args:
        df: DataFrame exposing ``Text``, ``label_encoded`` and
            ``space_type`` columns.

    Returns:
        A ``(text, labels)`` tuple: the assembled text with surrounding
        whitespace stripped, and a flattened numpy array holding the label
        of each remaining character.
    """
    parts = []
    labels = []
    for row in df.itertuples():
        if row.space_type == 0:
            separator = ""
        elif row.space_type in (1, 2):
            separator = " "
        elif row.space_type in (3, 4):
            separator = "\n"
        else:
            separator = "\n\n"
        chunk = row.Text + separator
        parts.append(chunk)
        # One label per character, including the separator characters.
        labels.extend([row.label_encoded] * len(chunk))

    raw = "".join(parts)
    text = raw.strip()
    # strip() trims both ends, so drop the labels of the leading characters
    # it removed as well; truncating only the tail would shift every label
    # off its character.
    start = len(raw) - len(raw.lstrip())
    return text, np.array(labels[start:start + len(text)]).flatten()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment