Last active
September 15, 2024 17:51
-
-
Save poontology/6f5eefc28268e36f877589f0768e782e to your computer and use it in GitHub Desktop.
Cleanup log of api_dump.sqlite from https://sukebei.nyaa.si/view/4045598
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ sqlite3 api_dump.sqlite | |
SQLite version 3.46.1 2024-08-13 09:16:08 | |
Enter ".help" for usage hints. | |
sqlite> select count(*) from gallery; | |
2378815 | |
sqlite> .changes on | |
sqlite> .timer on | |
sqlite> delete from gallery where category!='Manga' and category!='Doujinshi'; | |
Run Time: real 129.907 user 28.681102 sys 63.180995 | |
changes: 1489668 total_changes: 1489668 | |
sqlite> select count(*) from gallery; | |
889147 | |
-- only interested in Japanese originals and English translations | |
sqlite> select count(*),language from gallery where language not like "%'english'%" group by language order by count(*); | |
61|['rewrite'] | |
81|['speechless', 'text cleaned'] | |
... | |
712|['translated'] | |
803|['translated', 'ukrainian'] | |
839|['polish', 'translated'] | |
1323|['speechless'] | |
1361|['indonesian', 'translated'] | |
2847|['text cleaned'] | |
3067|['german', 'translated'] | |
4434|['korean'] | |
4497|['translated', 'vietnamese'] | |
4966|['italian', 'translated'] | |
7839|['chinese'] | |
8936|['thai', 'translated'] | |
10522|['french', 'translated'] | |
16601|['portuguese', 'translated'] | |
22512|['russian', 'translated'] | |
49605|['spanish', 'translated'] | |
80081|['korean', 'translated'] | |
136210|['chinese', 'translated'] | |
Run Time: real 0.882 user 0.612735 sys 0.268879 | |
sqlite> delete from gallery where male like "%'males only'%"; | |
Run Time: real 37.653 user 2.493469 sys 6.744719 | |
changes: 66360 total_changes: 1556028 | |
sqlite> delete from gallery where language is not null and language not like "%'english'%" and language!="['translated']" and language!="['speechless']" and language!="['text cleaned']"; | |
Run Time: real 61.197 user 7.647673 sys 16.797743 | |
changes: 335761 total_changes: 1891789 | |
sqlite> delete from gallery where female like "%'females only'%"; | |
Run Time: real 26.156 user 0.660319 sys 1.953976 | |
changes: 12780 total_changes: 1904569 | |
-- above reductions bring filesize from 1.7G -> 580M | |
-- something else to consider for removal | |
sqlite> select count(*) from gallery where male like "%'furry'%" and female like "%'furry'%"; | |
2251 | |
sqlite> select count(*) from gallery where male like "%'dilf'%"; | |
24355 | |
sqlite> select count(*) from gallery where rating < 3; | |
64636 | |
sqlite> select count(*) from gallery where rating!=0 and rating < 3; | |
61894 | |
-- huh? | |
sqlite> select count(*) from gallery where removed=1 and torrentcount>0; | |
55 | |
sqlite> select count(*) from gallery where removed=1 and torrents is not null; | |
24 | |
-- ongoing series etc. are uploaded as multiple partial galleries; we only want the latest one | |
sqlite> select count(*),title from gallery group by title having count(*)>5 order by count(*) desc limit 5; | |
83|[dogado] Homo Sexience [Ongoing] | |
74|MusSoap [On Going] | |
70|[Korotsuke] Nekura Megane ♀ | |
58|[Kidouchi_Kon] GAME/DEATH | |
54|[nu-] Bibia Saikou ka yo! | |
Run Time: real 1.068 user 0.832136 sys 0.218259 | |
sqlite> .header on | |
-- titles within a series can differ; use first_gid instead, which points to the first entry in the series | |
sqlite> select count(*),datetime(min(posted),'auto') as oldest,datetime(max(posted),'auto') as newest,title from gallery where first_gid is not null group by first_gid having count(*)>5 order by count(*) desc limit 5; | |
count(*)|oldest|newest|title | |
84|2016-02-08 20:32:28|2017-10-05 20:25:32|[Insane] Love Parameter Ch.1-112 (English) | |
82|2014-03-19 13:19:25|2015-01-06 15:20:27|[dogado] Homo Sexience [Ongoing] | |
80|2016-09-05 01:49:21|2020-07-11 22:45:53|MusSoap [On Going] | |
66|2017-04-21 20:35:41|2019-02-18 22:18:50|[Worin] Brawling Go Ch.76-148 [English] [Ongoing] | |
64|2022-02-18 05:55:15|2024-01-20 02:35:15|[Himeno Mikan] Toki o Kakeru [Ongoing] | |
Run Time: real 0.283 user 0.126232 sys 0.156648 | |
-- count rows that share a first_gid. We keep only the latest row per group (one fewer deletion per group), but we also delete the original row where gid=first_gid, which is not counted here (one more deletion per group), so deletable_rows is still accurate | |
sqlite> select count(*) num_of_groups,sum(c) deletable_rows from (select count(*) c from gallery where first_gid is not null group by first_gid having count(*) > 1); | |
num_of_groups|deletable_rows | |
5875|24931 | |
Run Time: real 0.271 user 0.117217 sys 0.153546 | |
sqlite> begin; | |
sqlite> delete from gallery where first_gid is not null and gid not in (select max(gid) from gallery where first_gid is not null group by first_gid); | |
Run Time: real 13.398 user 0.695310 sys 2.170534 | |
changes: 19056 total_changes: 19056 | |
-- (deletable_rows-num_of_groups=19056, gid=first_gid deleted separately) | |
-- the next delete also covers series consisting of only the original gid + one updated gallery (not counted above because of having count(*) > 1) | |
sqlite> delete from gallery where gid in (select first_gid from gallery where first_gid is not null group by first_gid); | |
Run Time: real 8.796 user 0.569944 sys 1.651091 | |
changes: 22098 total_changes: 41154 | |
sqlite> commit; | |
Run Time: real 14.445 user 0.093518 sys 1.238609 | |
-- 580M -> 508M | |
sqlite> select count(*) from gallery; | |
433092 | |
sqlite> select count(*) from gallery where rating!=0 and rating < 3; | |
57751 | |
sqlite> select count(*),GROUP_CONCAT(first_gid),title from gallery group by title order by count(*) desc limit 25; | |
count(*)|GROUP_CONCAT(first_gid)|title | |
49||[Nori5rou] Imaizumin-chi wa Douyara Gal no Tamariba ni Natteru Rashii 4 [English] [Tamamo] [Digital] | |
28||MODS = IDIOT DISHONEST SONS OF WHORES | |
22|647556|1 | |
17||[Katatuka Kouji] Kanban Musume | |
16||test | |
12||ane no kareshi ni moteasobareta watashi _ library | |
12||Unknown Artist | |
12||Kaede & Suzu 1-6 | |
11||[Crimson] Chikubi de Sokuiki suru Joshidaisei 4 | |
9||[age31] Renshuu | |
9|1688908|Test | |
8|511916|hentai | |
8|255642|[Yamatogawa] Witchcraft | |
8||[Nori5rou] Imaizumin-chi wa Douyara Gal no Tamariba ni Natteru Rashii [English] [Tamamo] | |
8||[Naraku No Nimotsu] Hoshiguma''s Secret Contract (Arknight) [Incomplete] [Ongoing] | |
8|1402631,1522729|. | |
8||(C82) [Funi Funi Lab (Tamagoro)] Chichikko Bitch 2 (Fairy Tail) | |
7||姉のカレシにもてあそばれた私_図書館編 | |
7||unknown | |
7||title | |
7||[we53] Yukari-chan to (VOCALOID) | |
7||[Lunch] Koinaka | |
7|2547153|[Crimson] Virgin Train R | |
7||[Chacharan] Kono Tabi Jeanne to Aka-chan Tsukurimasu (Fate/Grand Order) | |
7||2 | |
Run Time: real 1.311 user 0.788409 sys 0.231355 | |
changes: 22098 total_changes: 41154 | |
sqlite> delete from gallery where female like "%'guro'%" or male like "%'guro'%"; | |
Run Time: real 13.037 user 0.561839 sys 1.031127 | |
changes: 3353 total_changes: 44507 | |
sqlite> delete from gallery where male like "%'vore'%" or female like "%'vore'%"; | |
Run Time: real 8.272 user 0.484399 sys 0.623145 | |
changes: 1844 total_changes: 46351 | |
sqlite> delete from gallery where male like "%'snuff'%" or female like "%'snuff'%"; | |
Run Time: real 5.643 user 0.480822 sys 0.625165 | |
changes: 2023 total_changes: 48374 | |
-- in some cases the latest upload within a series may be a badly tagged reupload, or this could | |
-- also mean that the entire series is duplicated under another first_gid; requires manual cleaning? | |
sqlite> select count(*) from gallery where first_gid is not null and other="['already uploaded']"; | |
826 | |
-- about half have rating<3 | |
sqlite> select count(*) from gallery; | |
425872 | |
sqlite> delete from gallery where rating!=0 and rating < 3; | |
Run Time: real 11.382 user 1.089529 sys 3.913498 | |
changes: 56383 total_changes: 56383 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment