Last active
April 19, 2018 09:48
-
-
Save mhbeals/415df6eb58b434cff7d5b8002577a2ce to your computer and use it in GitHub Desktop.
A script to download all (as of April 2018) newspaper articles from Trove
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
import requests | |
from lxml import etree | |
import xml.etree.ElementTree as ET | |
# Trove newspaper title IDs grouped by decade. Use one of these lists, or visit
# https://gist.github.com/mhbeals/1ad7cd04ca0f8fd74e12f6151664873e for the full listing.
list_pre_1840 = [3,1046,1047,4,5,50,273,22,23,76,1230,19,24,1282,1235,693,95,272,1236,695,696,1237,37,525,944,6,1233,1239,869,694,945,1240,1242,66,170,1013,1238,40,1142,1232,1241,1329,935,171,96,1137,936,1243,41,20,48,1231,984] | |
list_1840 = [1330,1030,1331,986,181,21,1037,339,292,1014,1015,867,1336,1027,1012,1022,1036,74,1026,1025,1039,35,110,8,1033,1028,1035,190,821,1040,1034,78,1020,1018,1023,1038,1031,1021,690,178,1234,1011,172,58,1029,1016,285,160,1024,1004,1017,1138,937,1019,1032,941,18,14,94,863,864,987,284,26,54,364,13,1100,938,939,55] | |
list_1850 = [1244,314,464,67,1139,277,56,1041,283,174,805,107,1339,582,31,809,326,1245,948,213,180,874,346,189,484,262,161,669,985,706,193,685,162,496,959,32,1324,33,163,1246,257,1247,7,287,1053,382,380,164,558,1248,104,1054,65,381,353] | |
list_1860 = [365,415,927,478,10,697,808,394,57,479,731,278,1190,958,376,276,15,150,92,1056,108,377,940,319,807,263,510,1249,239,360,371,264,27,361,1092,756,103,366,16,83,460,28,399,659,185,406,391,125,126,392,1068,865,383,1062,42,260,373,1258,848,246,354,1093,1250,1259,386,259,835,1340,1260,1263,684,84,72,114,288,487,792,838,153,527,412,839,118,966,311,953,372,1334,1050,1148,223,621,508,232,1261,241] | |
list_1870 = [127,52,567,954,1264,575,946,698,810,725,91,1266,222,950,803,428,840,1171,49,151,184,656,9,1149,355,258,463,29,282,512,1226,902,1332,1069,1145,1172,186,122,1150,374,413,1224,1057,715,720,891,70,356,955,474,81,951,109,719,43,208,639,1144,154,155,988,195,734,862,1265,77,123,1335,850,398,475,963,1262,1052,59,1173,1065,1128,1167,964,701,631,1296,265,261,137,30,712] | |
list_1880 = [483,281,990,1146,714,947,368,834,248,367,289,177,416,1072,152,1154,471,855,503,804,405,334,225,209,1141,240,350,728,145,516,275,853,234,1270,338,231,80,742,521,866,965,134,1297,60,217,387,335,218,671,1096,624,991,522,235,1147,993,230,713,337,204,1177,1220,357,101,1298,135,196,501,388,499,655,1206,1042,132,1183,949,836,1074,201,200,1199,198,389,402,657,425,214,426,1178,341,1212,952,1200,215,199,194,252,228,133,1086,192,397,472,819,545,351,511,211,994,247,519,53,956,290,34,925,1048,906,111,105,244,826,64,825,565,650,205] | |
list_1890 = [191,648,71,824,207,960,716,1077,1051,117,270,1201,1075,806,124,518,1049,128,268,641,700,970,1269,745,885,129,647,847,675,266,1316,403,829,269,212,502,688,747,345,858,203,1070,271,130,842,1209,296,1076,140,431,492,188,643,493,309,699,138,197,730,384,1279,704,983,907,407,1222,99,530,1103,505,139,1338,1198,591,1165,1083,638,922,724,870,342,1203,291,450,488,385,447,1272,663,436,245,1277,846,424,73,528,506,873,845,340,274,817,1273,1202,843,1251,737,175,79,1161,961,116,1204,98,120,721,886,489,877,469,702,490,645,1176,498,459,736,969,430,141,975,1185,136,348,1268,142,1120,486,1091,393,480,362,115,881,636,1153,880,452,448,904,396,1160,1006,121,660,1278,887,976,176] | |
list_1900s = [68,453,691,748,683,379,434,723,1159,143,596,841,1271,658,202,755,741,1317,437,942,692,980,467,726,981,876,1152,497,1214,526,1252,477,1166,717,89,1067,297,523,593,419,336,494,735,524,466,1299,1349,458,705,1218,93,744,547,1309,179,1253,514,888,1184,553,446,146,680,908,455,1181,427,982,286,1194,1318,1097,1058,414,739,1205,445,113,854,1163,390,1195,992,923,500,1219,823,224,517,729,1043,979,1281,1079,644,973,1192,1081,254,957,443,473,1117,1223,971,837,495,451,898,465,1322,532,689,857,1089,1125,168,173,795,169,1059,859,974,989,1105,1193,97,814,515,816,924,62,86,633,1227,156,860,1158,897,1162,206,903,1064,635,1168,674,378,1066,1156,444,1179,61,1300,977,482,1157,1228,1098,818,743,879,667,395,1088,491,1063,681,1180,504,709,1301,159,646,738,820,1182,481,899,687,812,420,470,872,1225,1143,210,896,1170,358,1197,928,330,238,607,1155,421,423,422,637,1221,708,740,703,1333,75,654,828,1078,815,830,967,529,1118,1127,893,652,352,653,343,456,454,733,1116,564,401,1164,918,1087,746,449,429,1007,534,539,542,312,751,303,776,602,606,242,293,546,322,576,317,583,306,307,794,1325,768,618,227,934,978,754,750,544,763,550,551,774,324,573,574,781,313,598,600,318,789,798,608,229,793,613,302,933,932,298,331,605,300,249,315,541,316,563,321,762,570,752,753,777,587,588,592,601,604,609,320,536,560,766,577,578,581,771,597,299,301,537,548,556,561,568,584,614,616,785,554,555,559,562,569,571,580,773,791,585,611,615,617,783,619,622,759,790,295,535,761,333,589,304,594,328,599,787,538,586,603,780,612,243,797,507,682,782,1186,889,620,323,1129,308,784,626,250,796,765,1084,610,800,329,557,1169,359,661,827,707,758,883,87,468,770,788,549,757,595,566,332,772,769,767,779,649,1130,440,878,1005,438,305,590,811,931,1131,409,485,552,844,1133,786,882,1085,775,131,686,930,579,325,439,513,678,1124,533,813,100,929,432,1126,310,894,1107,764,147,327,441,1123,1302,294,926,1080,540,1071,349,651,861,799,760,1115,572,822,457,749,543,1110,509,778,679,900,46,634,347,892,1229,623,404,1119,158,919,344,1114,1111,255,418,90
9,995,433,236,1313,1009,25,253,997,1256,267,1008,148,875,1094,106,102,1101,1280,1003,1010,461,968,884,628,1274,531,1284,917,1315,251,676,1106,1257,895,868,943,417,69,435,119,1090,625,166,165,1319,913,11,1207,375,167,851,36,370,901,1113,1275,1314,632,664,1060,1151,1276,256,90,220,911,672,1208,914,410,1320,1312,1134,187,718,915,916,831,630,1102,912,1073,45,833,1122,44,920,279,1310,832,710,1196,962,662,112,1321,12,668,1121,1189,722,1213,1174,673,1311,801,852,183,520,216,51,999,1188,1305,1285,910,237,1002,727,629,149,921,972,369,1099,1109,627,1061,998,47,711,1000,1001,996,411,144,400,476,1112,1283,1095,182,1303,363,732,665,63,1055,221,1108,408,38,802,1307,157,233,280,1210,670,442,666,871,890,462,1044,1082,1304,1211,1341,1323,1187,39,1306,1191,1175,1346,1288,1294,1286,1289,1308,1104,1290,1292,1326,1295,1132,1327,1328,856,1343,1291,1045,1344,1345,905] | |
# Enter your Trove API key here. It is sent with EVERY request made below
# (both the article-count query and the page downloads).
apikey = ""

# Endpoint queried for all requests, and the number of records asked for per page.
API_URL = "http://api.trove.nla.gov.au/result"
PAGE_SIZE = 100


def _fetch_results(title_id, start, count, include_text):
    """Fetch one page of newspaper-zone results for a title as raw XML bytes.

    Args:
        title_id: Title ID passed to the API as the ``l-title`` filter.
        start: Value of the ``s`` (start offset) query parameter.
        count: Value of the ``n`` (records per page) query parameter.
        include_text: When true, adds ``include=articletext`` to the query.

    Returns:
        The undecoded response body. Returning bytes (rather than
        ``response.text``) keeps every character the server sent, so the
        saved XML is not stripped of non-ASCII characters.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    params = {
        "q": " ",  # encoded by requests as "q=+", same as the original URL
        "zone": "newspaper",
        "l-title": str(title_id),
        "s": str(start),
        "n": str(count),
        "sortby": "dateasc",
        "key": apikey,
    }
    if include_text:
        params["include"] = "articletext"
    response = requests.get(API_URL, params=params, timeout=60)
    # Fail loudly instead of saving an error page as if it were article data.
    response.raise_for_status()
    return response.content


if not apikey:
    raise SystemExit("Set `apikey` to your Trove API key before running this script.")

# Goes through every title in the chosen decade (as of April 2018).
# Change `list_pre_1840` to another decade list or to a custom list of title IDs.
for item in list_pre_1840:
    # A one-record query is enough to read the current total article count,
    # which the response exposes as the `total` attribute of `records`.
    root = ET.fromstring(_fetch_results(item, 1, 1, False))

    # Reset per title so a response without a `records` element cannot reuse
    # the previous title's count (or hit an undefined name on the first title).
    max_articles = 0
    for records in root.iter('records'):
        max_articles = int(records.attrib.get('total', 0))

    # Page through the title PAGE_SIZE records at a time and save each page.
    number = 0
    while number < max_articles:
        data = _fetch_results(item, number, PAGE_SIZE, True)
        # The title ID is part of the file name: the offset restarts at 0 for
        # every title, so offset-only names would overwrite earlier titles.
        with open(str(item) + "_" + str(number) + '.xml', 'wb') as f:
            f.write(data)
        print(str(number) + "-" + str(number + PAGE_SIZE) + " out of " + str(max_articles) + " collected\n")
        number = number + PAGE_SIZE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment