Created
February 2, 2025 09:56
-
-
Save asdf913/67897b8eb4769d3004edc48098449611 to your computer and use it in GitHub Desktop.
Extract all Japanese adverbs from https://jlptsensei.com/complete-japanese-adverbs-list/ and write the list to an XLSX spreadsheet. Each sheet represents a JLPT examination level.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.collections4.IterableUtils;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
public class JlptAdverbList { | |
public static void main(String[] args) throws MalformedURLException, IOException, URISyntaxException { | |
// | |
final String url = "https://jlptsensei.com/complete-japanese-adverbs-list/"; | |
// | |
final Document document = Jsoup.parse(new URI(url).toURL(), 0); | |
// | |
final File file = new File("jlpt-advert.xlsx"); | |
// | |
System.out.println(file.getAbsolutePath()); | |
// | |
try (final Workbook workbook = new XSSFWorkbook(); final OutputStream os = new FileOutputStream(file)) { | |
// | |
final Sheet sheet = workbook.createSheet("all"); | |
// | |
write(sheet, url, true); | |
// | |
final Iterable<Element> elements = document != null ? document.select("a.page-numbers") : null; | |
// | |
Element element = null; | |
// | |
for (int i = 0; i < IterableUtils.size(elements); i++) { | |
// | |
if ((element = IterableUtils.get(elements, i)) == null || element.hasAttr("next")) { | |
// | |
continue; | |
// | |
} // if | |
// | |
write(sheet, element.attr("href"), false); | |
// | |
} // for | |
// | |
Collection<Element> as = getElementsByTag(document, "a"); | |
// | |
as = as != null ? as.stream() | |
.filter(x -> x != null && matches(text(x), "^JLPT\\s+(N\\d+)\\s+adverbs\\s+list$") | |
&& StringUtils.equals(x.attr("href"), absUrl(x, "href"))) | |
.collect(Collectors.toList()) : null; | |
// | |
Pattern pattern = null; | |
// | |
Matcher matcher = null; | |
// | |
Element a = null; | |
// | |
for (int i = 0; i < IterableUtils.size(as) | |
&& (pattern = ObjectUtils.getIfNull(pattern, | |
() -> Pattern.compile("^JLPT\\s+(N\\d+)\\s+adverbs\\s+list$"))) != null | |
&& (matcher = pattern.matcher(text(a = IterableUtils.get(as, i)))) != null && matcher.matches() | |
&& matcher.groupCount() > 0; i++) { | |
// | |
write(workbook.createSheet(matcher.group(1)), absUrl(a, "href"), true); | |
// | |
} // for | |
// | |
workbook.write(os); | |
// | |
} // try | |
// | |
} | |
private static String absUrl(final Node instance, final String attributeKey) { | |
return instance != null ? instance.absUrl(attributeKey) : null; | |
} | |
private static boolean matches(final String instance, final String regex) { | |
return instance != null && instance.matches(regex); | |
} | |
private static void write(final Sheet sheet, final String url, final boolean header) | |
throws MalformedURLException, IOException, URISyntaxException { | |
// | |
write(sheet, | |
testAndApply(x -> IterableUtils.size(x) == 1, | |
getElementsByTag(Jsoup.parse(url != null ? new URI(url).toURL() : null, 0), "table"), | |
x -> IterableUtils.get(x, 0), null), | |
header); | |
// | |
} | |
private static void write(final Sheet sheet, final Element table, final boolean header) { | |
// | |
Iterable<Element> elements = children( | |
IterableUtils.size(elements = getElementsByTag(table, "thead")) == 1 ? IterableUtils.get(elements, 0) | |
: null); | |
// | |
final Element tr = IterableUtils.size(elements) == 1 ? IterableUtils.get(elements, 0) : null; | |
// | |
Row row = null; | |
// | |
int maxCellCount = 0; | |
// | |
if (tr != null && (row = createRow(sheet, getPhysicalNumberOfRows(sheet))) != null && header) { | |
// | |
for (int i = 0; i < tr.childrenSize(); i++) { | |
// | |
setCellValue(row.createCell(row.getPhysicalNumberOfCells()), text(tr.child(i))); | |
// | |
} // for | |
// | |
maxCellCount = Math.max(maxCellCount, tr.childrenSize()); | |
// | |
} // if | |
// | |
elements = children( | |
IterableUtils.size(elements = getElementsByTag(table, "tbody")) == 1 ? IterableUtils.get(elements, 0) | |
: null); | |
// | |
Iterable<Element> trs = null; | |
// | |
for (int i = 0; i < IterableUtils.size(elements) | |
&& (trs = children(IterableUtils.get(elements, i))) != null; i++) { | |
// | |
if (IterableUtils.size(trs) != maxCellCount | |
|| (row = createRow(sheet, sheet.getLastRowNum() + 1)) == null) { | |
// | |
continue; | |
// | |
} // if | |
// | |
for (int j = 0; j < IterableUtils.size(trs); j++) { | |
// | |
setCellValue(row.createCell(row.getPhysicalNumberOfCells()), text(IterableUtils.get(trs, j))); | |
// | |
} // for | |
// | |
} // for | |
// | |
} | |
private static <T, R> R testAndApply(final Predicate<T> predicate, final T value, final Function<T, R> functionTrue, | |
final Function<T, R> functionFalse) { | |
return test(predicate, value) ? apply(functionTrue, value) : apply(functionFalse, value); | |
} | |
private static <T> boolean test(final Predicate<T> instance, final T value) { | |
return instance != null && instance.test(value); | |
} | |
private static <T, R> R apply(final Function<T, R> instance, final T value) { | |
return instance != null ? instance.apply(value) : null; | |
} | |
private static Elements getElementsByTag(final Element instance, final String tagName) { | |
return instance != null ? instance.getElementsByTag(tagName) : null; | |
} | |
private static String text(final Element instance) { | |
return instance != null ? instance.text() : null; | |
} | |
private static void setCellValue(final Cell instance, final String value) { | |
if (instance != null) { | |
instance.setCellValue(value); | |
} | |
} | |
private static Elements children(final Element instance) { | |
return instance != null ? instance.children() : null; | |
} | |
private static int getPhysicalNumberOfRows(final Sheet instance) { | |
return instance != null ? instance.getPhysicalNumberOfRows() : 0; | |
} | |
private static Row createRow(final Sheet instance, final int rownum) { | |
return instance != null ? instance.createRow(rownum) : null; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- Maven dependencies for JlptAdverbList: jsoup (HTML fetching/parsing), commons-collections4 and poi-ooxml (XLSX output). -->
<!-- NOTE(review): JlptAdverbList also imports org.apache.commons.codec.binary.StringUtils and org.apache.commons.lang3.ObjectUtils, which are not declared here; presumably they are resolved transitively. Verify, or declare them explicitly. -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> | |
<dependency> | |
<groupId>org.jsoup</groupId> | |
<artifactId>jsoup</artifactId> | |
<version>1.18.3</version> | |
</dependency> | |
<!--https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 --> | |
<dependency> | |
<groupId>org.apache.commons</groupId> | |
<artifactId>commons-collections4</artifactId> | |
<version>4.4</version> | |
</dependency> | |
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> | |
<dependency> | |
<groupId>org.apache.poi</groupId> | |
<artifactId>poi-ooxml</artifactId> | |
<version>5.4.0</version> | |
</dependency> |
Download the file
Then, visit https://georgeom.net/StegOnline
Click Extract Files/Data
Check the (7R) checkbox and then click the Go button
Click Download Extracted Data button
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Right click the link and then download the file.
After the download completes, please change the file extension from "png" to "xlsx"