Skip to content

Instantly share code, notes, and snippets.

@poontology
Last active September 15, 2024 17:51
Show Gist options
  • Save poontology/6f5eefc28268e36f877589f0768e782e to your computer and use it in GitHub Desktop.
Save poontology/6f5eefc28268e36f877589f0768e782e to your computer and use it in GitHub Desktop.
Cleanup log of api_dump.sqlite from https://sukebei.nyaa.si/view/4045598
$ sqlite3 api_dump.sqlite
SQLite version 3.46.1 2024-08-13 09:16:08
Enter ".help" for usage hints.
sqlite> select count(*) from gallery;
2378815
sqlite> .changes on
sqlite> .timer on
sqlite> delete from gallery where category!='Manga' and category!='Doujinshi';
Run Time: real 129.907 user 28.681102 sys 63.180995
changes: 1489668 total_changes: 1489668
sqlite> select count(*) from gallery;
889147
-- only interested in Japanese originals and English translations
sqlite> select count(*),language from gallery where language not like "%'english'%" group by language order by count(*);
61|['rewrite']
81|['speechless', 'text cleaned']
...
712|['translated']
803|['translated', 'ukrainian']
839|['polish', 'translated']
1323|['speechless']
1361|['indonesian', 'translated']
2847|['text cleaned']
3067|['german', 'translated']
4434|['korean']
4497|['translated', 'vietnamese']
4966|['italian', 'translated']
7839|['chinese']
8936|['thai', 'translated']
10522|['french', 'translated']
16601|['portuguese', 'translated']
22512|['russian', 'translated']
49605|['spanish', 'translated']
80081|['korean', 'translated']
136210|['chinese', 'translated']
Run Time: real 0.882 user 0.612735 sys 0.268879
sqlite> delete from gallery where male like "%'males only'%";
Run Time: real 37.653 user 2.493469 sys 6.744719
changes: 66360 total_changes: 1556028
sqlite> delete from gallery where language is not null and language not like "%'english'%" and language!="['translated']" and language!="['speechless']" and language!="['text cleaned']";
Run Time: real 61.197 user 7.647673 sys 16.797743
changes: 335761 total_changes: 1891789
sqlite> delete from gallery where female like "%'females only'%";
Run Time: real 26.156 user 0.660319 sys 1.953976
changes: 12780 total_changes: 1904569
-- above reductions bring filesize from 1.7G -> 580M
-- something else to consider for removal
sqlite> select count(*) from gallery where male like "%'furry'%" and female like "%'furry'%";
2251
sqlite> select count(*) from gallery where male like "%'dilf'%";
24355
sqlite> select count(*) from gallery where rating < 3;
64636
sqlite> select count(*) from gallery where rating!=0 and rating < 3;
61894
-- huh?
sqlite> select count(*) from gallery where removed=1 and torrentcount>0;
55
sqlite> select count(*) from gallery where removed=1 and torrents is not null;
24
-- ongoing series etc. are uploaded as multiple partial galleries; we only want the latest one
sqlite> select count(*),title from gallery group by title having count(*)>5 order by count(*) desc limit 5;
83|[dogado] Homo Sexience [Ongoing]
74|MusSoap [On Going]
70|[Korotsuke] Nekura Megane ♀
58|[Kidouchi_Kon] GAME/DEATH
54|[nu-] Bibia Saikou ka yo!
Run Time: real 1.068 user 0.832136 sys 0.218259
sqlite> .header on
-- titles within a series can differ; use first_gid instead, since it points to the first entry in the series
sqlite> select count(*),datetime(min(posted),'auto') as oldest,datetime(max(posted),'auto') as newest,title from gallery where first_gid is not null group by first_gid having count(*)>5 order by count(*) desc limit 5;
count(*)|oldest|newest|title
84|2016-02-08 20:32:28|2017-10-05 20:25:32|[Insane] Love Parameter Ch.1-112 (English)
82|2014-03-19 13:19:25|2015-01-06 15:20:27|[dogado] Homo Sexience [Ongoing]
80|2016-09-05 01:49:21|2020-07-11 22:45:53|MusSoap [On Going]
66|2017-04-21 20:35:41|2019-02-18 22:18:50|[Worin] Brawling Go Ch.76-148 [English] [Ongoing]
64|2022-02-18 05:55:15|2024-01-20 02:35:15|[Himeno Mikan] Toki o Kakeru [Ongoing]
Run Time: real 0.283 user 0.126232 sys 0.156648
-- count rows that share a first_gid: we keep the latest row of each group (one fewer to delete) but also delete the row where gid=first_gid, which isn't counted here (one more to delete), so deletable_rows is still accurate
sqlite> select count(*) num_of_groups,sum(c) deletable_rows from (select count(*) c from gallery where first_gid is not null group by first_gid having count(*) > 1);
num_of_groups|deletable_rows
5875|24931
Run Time: real 0.271 user 0.117217 sys 0.153546
sqlite> begin;
sqlite> delete from gallery where first_gid is not null and gid not in (select max(gid) from gallery where first_gid is not null group by first_gid);
Run Time: real 13.398 user 0.695310 sys 2.170534
changes: 19056 total_changes: 19056
-- (deletable_rows-num_of_groups=19056, gid=first_gid deleted separately)
-- this also includes series consisting of only original gid + one updated gallery..
sqlite> delete from gallery where gid in (select first_gid from gallery where first_gid is not null group by first_gid);
Run Time: real 8.796 user 0.569944 sys 1.651091
changes: 22098 total_changes: 41154
sqlite> commit;
Run Time: real 14.445 user 0.093518 sys 1.238609
-- 580M -> 508M
sqlite> select count(*) from gallery;
433092
sqlite> select count(*) from gallery where rating!=0 and rating < 3;
57751
sqlite> select count(*),GROUP_CONCAT(first_gid),title from gallery group by title order by count(*) desc limit 25;
count(*)|GROUP_CONCAT(first_gid)|title
49||[Nori5rou] Imaizumin-chi wa Douyara Gal no Tamariba ni Natteru Rashii 4 [English] [Tamamo] [Digital]
28||MODS = IDIOT DISHONEST SONS OF WHORES
22|647556|1
17||[Katatuka Kouji] Kanban Musume
16||test
12||ane no kareshi ni moteasobareta watashi _ library
12||Unknown Artist
12||Kaede & Suzu 1-6
11||[Crimson] Chikubi de Sokuiki suru Joshidaisei 4
9||[age31] Renshuu
9|1688908|Test
8|511916|hentai
8|255642|[Yamatogawa] Witchcraft
8||[Nori5rou] Imaizumin-chi wa Douyara Gal no Tamariba ni Natteru Rashii [English] [Tamamo]
8||[Naraku No Nimotsu] Hoshiguma''s Secret Contract (Arknight) [Incomplete] [Ongoing]
8|1402631,1522729|.
8||(C82) [Funi Funi Lab (Tamagoro)] Chichikko Bitch 2 (Fairy Tail)
7||姉のカレシにもてあそばれた私_図書館編
7||unknown
7||title
7||[we53] Yukari-chan to (VOCALOID)
7||[Lunch] Koinaka
7|2547153|[Crimson] Virgin Train R
7||[Chacharan] Kono Tabi Jeanne to Aka-chan Tsukurimasu (Fate/Grand Order)
7||2
Run Time: real 1.311 user 0.788409 sys 0.231355
changes: 22098 total_changes: 41154
sqlite> delete from gallery where female like "%'guro'%" or male like "%'guro'%";
Run Time: real 13.037 user 0.561839 sys 1.031127
changes: 3353 total_changes: 44507
sqlite> delete from gallery where male like "%'vore'%" or female like "%'vore'%";
Run Time: real 8.272 user 0.484399 sys 0.623145
changes: 1844 total_changes: 46351
sqlite> delete from gallery where male like "%'snuff'%" or female like "%'snuff'%";
Run Time: real 5.643 user 0.480822 sys 0.625165
changes: 2023 total_changes: 48374
-- in some cases the latest upload within a series may be a badly tagged reupload, or this could
-- also mean that the entire series is duplicated under another first_gid; requires manual cleaning?
select count(*) from gallery where first_gid is not null and other="['already uploaded']";
826
-- about half have rating<3
sqlite> select count(*) from gallery;
425872
sqlite> delete from gallery where rating!=0 and rating < 3;
Run Time: real 11.382 user 1.089529 sys 3.913498
changes: 56383 total_changes: 56383
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment