Skip to content

Instantly share code, notes, and snippets.

@asdf913
Created February 2, 2025 09:56
Show Gist options
  • Save asdf913/67897b8eb4769d3004edc48098449611 to your computer and use it in GitHub Desktop.
Save asdf913/67897b8eb4769d3004edc48098449611 to your computer and use it in GitHub Desktop.
Extract all Japanese adverbs from https://jlptsensei.com/complete-japanese-adverbs-list/ and write the list to an XLSX spreadsheet. Each sheet represents a JLPT examination level.
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.collections4.IterableUtils;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
/**
 * Downloads the adverb tables from the JLPT Sensei adverb list page and writes
 * them to an XLSX workbook.
 *
 * <p>
 * The workbook contains one sheet named {@code all}, filled from the start page
 * and from every page linked through an {@code a.page-numbers} anchor, plus one
 * sheet per link whose text matches {@code JLPT N<digits> adverbs list} (the
 * sheet is named after the captured level, e.g. {@code N5}).
 */
public class JlptAdverbList {

    /** Start page that is scraped. */
    private static final String LIST_URL = "https://jlptsensei.com/complete-japanese-adverbs-list/";

    /** Name of the workbook written to the current working directory. */
    private static final String OUTPUT_FILE_NAME = "jlpt-advert.xlsx";

    /**
     * Connect/read timeout handed to jsoup. A finite value is used so that a
     * stalled connection fails with an {@link IOException} instead of blocking
     * forever (jsoup treats a timeout of 0 as "no timeout").
     */
    private static final int TIMEOUT_MILLIS = 30_000;

    /** Link text of a per-level list; group 1 captures the level name. */
    private static final Pattern LEVEL_LINK_TEXT = Pattern.compile("^JLPT\\s+(N\\d+)\\s+adverbs\\s+list$");

    /**
     * Scrapes the start page, its paginated siblings and the per-level pages,
     * then writes the resulting workbook.
     *
     * @param args ignored
     * @throws MalformedURLException if a page address cannot be turned into a URL
     * @throws IOException           if a page cannot be downloaded or the
     *                               workbook cannot be written
     * @throws URISyntaxException    if a page address is not a valid URI
     */
    public static void main(String[] args) throws MalformedURLException, IOException, URISyntaxException {
        final Document document = fetch(LIST_URL);

        final File file = new File(OUTPUT_FILE_NAME);
        System.out.println(file.getAbsolutePath());

        try (final Workbook workbook = new XSSFWorkbook()) {
            final Sheet all = workbook.createSheet("all");

            // The start page is already parsed; reuse it instead of downloading it again.
            write(all, singleTable(document), true);

            // Follow the pagination links. De-duplicate by absolute URL so that a
            // link repeating an already imported page (for example a "next"
            // shortcut or a second pagination bar) is not imported twice.
            final Set<String> visited = new LinkedHashSet<>();
            visited.add(LIST_URL);
            for (final Element link : document.select("a.page-numbers")) {
                final String href = absUrl(link, "href");
                if (href == null || href.isEmpty() || !visited.add(href)) {
                    continue;
                }
                write(all, href, false);
            }

            // One sheet per "JLPT Nx adverbs list" link whose href is already absolute.
            for (final Element anchor : document.getElementsByTag("a")) {
                final Matcher matcher = LEVEL_LINK_TEXT.matcher(anchor.text());
                final String href = anchor.attr("href");
                if (!matcher.matches() || href.isEmpty() || !StringUtils.equals(href, absUrl(anchor, "href"))) {
                    continue;
                }
                final String level = matcher.group(1);
                // createSheet rejects a name that is already in use, so a repeated
                // link to the same level must be skipped.
                if (workbook.getSheet(level) != null) {
                    continue;
                }
                write(workbook.createSheet(level), href, true);
            }

            // Open the output only after scraping succeeded, so a failure above
            // does not truncate an existing file.
            try (final OutputStream os = new FileOutputStream(file)) {
                workbook.write(os);
            }
        }
    }

    /**
     * Downloads and parses the page at {@code url}.
     *
     * @param url absolute page address
     * @return the parsed document
     * @throws IOException        if the page cannot be downloaded
     * @throws URISyntaxException if {@code url} is not a valid URI
     */
    private static Document fetch(final String url) throws IOException, URISyntaxException {
        return Jsoup.parse(new URI(url).toURL(), TIMEOUT_MILLIS);
    }

    /**
     * Downloads {@code url} and appends its table to {@code sheet}. Nothing is
     * written when {@code url} is {@code null} or empty, or when the page does
     * not contain exactly one {@code table} element.
     *
     * @param sheet  target sheet
     * @param url    absolute address of the page holding the table
     * @param header whether the table's header row is written as well
     * @throws MalformedURLException if {@code url} cannot be turned into a URL
     * @throws IOException           if the page cannot be downloaded
     * @throws URISyntaxException    if {@code url} is not a valid URI
     */
    private static void write(final Sheet sheet, final String url, final boolean header)
            throws MalformedURLException, IOException, URISyntaxException {
        if (url == null || url.isEmpty()) {
            return;
        }
        write(sheet, singleTable(fetch(url)), header);
    }

    /**
     * Appends the rows of {@code table} to {@code sheet}.
     *
     * <p>
     * The single {@code tr} inside the single {@code thead} defines the expected
     * column count; it is always evaluated, even when {@code header} is
     * {@code false}, so that body rows of follow-up pages are still accepted.
     * Body rows whose cell count differs from the header's are skipped.
     *
     * @param sheet  target sheet; nothing happens when {@code null}
     * @param table  table element to read; nothing happens when {@code null}
     * @param header whether the header row itself is written to the sheet
     */
    private static void write(final Sheet sheet, final Element table, final boolean header) {
        if (sheet == null || table == null) {
            return;
        }

        final Element headerRow = onlyElement(children(onlyElement(getElementsByTag(table, "thead"))));
        if (headerRow == null) {
            // Without a header row there is no column count to validate against.
            return;
        }
        final int columnCount = headerRow.childrenSize();

        if (header) {
            final Row row = sheet.createRow(nextRowIndex(sheet));
            for (int i = 0; i < columnCount; i++) {
                row.createCell(i).setCellValue(headerRow.child(i).text());
            }
        }

        final Elements bodyRows = children(onlyElement(getElementsByTag(table, "tbody")));
        if (bodyRows == null) {
            return;
        }
        for (final Element bodyRow : bodyRows) {
            final Elements cells = bodyRow.children();
            if (cells.size() != columnCount) {
                continue;
            }
            final Row row = sheet.createRow(nextRowIndex(sheet));
            for (int i = 0; i < cells.size(); i++) {
                row.createCell(i).setCellValue(cells.get(i).text());
            }
        }
    }

    /**
     * Returns the index at which the next row is appended to {@code sheet}.
     * The empty-sheet case is handled explicitly so the result does not depend
     * on what {@code getLastRowNum()} reports for a sheet without rows.
     */
    private static int nextRowIndex(final Sheet sheet) {
        return sheet.getPhysicalNumberOfRows() == 0 ? 0 : sheet.getLastRowNum() + 1;
    }

    /** Returns the only {@code table} below {@code root}, or {@code null} if there is not exactly one. */
    private static Element singleTable(final Element root) {
        return onlyElement(getElementsByTag(root, "table"));
    }

    /** Returns the sole element of {@code elements}, or {@code null} if it is {@code null} or its size is not 1. */
    private static Element onlyElement(final Elements elements) {
        return elements != null && elements.size() == 1 ? elements.get(0) : null;
    }

    /** Null-safe {@link Node#absUrl(String)}. */
    private static String absUrl(final Node instance, final String attributeKey) {
        return instance != null ? instance.absUrl(attributeKey) : null;
    }

    /** Null-safe {@link Element#getElementsByTag(String)}. */
    private static Elements getElementsByTag(final Element instance, final String tagName) {
        return instance != null ? instance.getElementsByTag(tagName) : null;
    }

    /** Null-safe {@link Element#children()}. */
    private static Elements children(final Element instance) {
        return instance != null ? instance.children() : null;
    }
}
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.4.0</version>
</dependency>
@asdf913
Copy link
Author

asdf913 commented Feb 2, 2025

jlpt-advert xlsx
Right click the link and then download the file.
After the download completes, please change the file extension from "png" to "xlsx".

@asdf913
Copy link
Author

asdf913 commented Feb 2, 2025

MP900049546
Download the file
Then, visit https://georgeom.net/StegOnline
image
Click Extract Files/Data
image
Check checkbox (7R) and then click Go button
image
Click Download Extracted Data button
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment