Created
May 22, 2020 06:51
-
-
Save bortkiewicz/778299fafb02711ffaea1ca35210d6d7 to your computer and use it in GitHub Desktop.
bithybrid
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma GCC target("avx512f,avx512bw") | |
// writer: w33z8kqrqk8zzzx33 | |
#include <bits/stdc++.h> | |
#include <immintrin.h> | |
using namespace std; | |
// begin fast read template by CYJian (source: https://www.luogu.com.cn/paste/i11c3ppx) | |
namespace io { | |
const int __SIZE = (1 << 25) + 1; | |
char ibuf[__SIZE], *iS, *iT, obuf[__SIZE], *oS = obuf, *oT = oS + __SIZE - 1, __c, qu[55]; int __f, qr, _eof; | |
#define Gc() (iS == iT ? (iT = (iS = ibuf) + fread (ibuf, 1, __SIZE, stdin), (iS == iT ? EOF : *iS ++)) : *iS ++) | |
inline void flush () { fwrite (obuf, 1, oS - obuf, stdout), oS = obuf; } | |
inline void reverse() { iS = ibuf; } | |
inline void gc (char &x) { x = Gc(); } | |
inline void pc (char x) { *oS ++ = x; if (oS == oT) flush (); } | |
inline void pstr (const char *s) { int __len = strlen(s); for (__f = 0; __f < __len; ++__f) pc (s[__f]); } | |
inline void gstr (char *s) { for(__c = Gc(); __c < 32 || __c > 126 || __c == ' ';) __c = Gc(); | |
for(; __c > 31 && __c < 127 && __c != ' '; ++s, __c = Gc()) *s = __c; *s = 0; } | |
template <class I> inline bool gi (I &x) { _eof = 0; | |
for (__f = 1, __c = Gc(); (__c < '0' || __c > '9') && !_eof; __c = Gc()) { if (__c == '-') __f = -1; _eof |= __c == EOF; } | |
for (x = 0; __c <= '9' && __c >= '0' && !_eof; __c = Gc()) x = x * 10 + (__c & 15), _eof |= __c == EOF; x *= __f; return !_eof; } | |
template <class I> inline void print (I x) { if (!x) pc ('0'); if (x < 0) pc ('-'), x = -x; | |
while (x) qu[++ qr] = x % 10 + '0', x /= 10; while (qr) pc (qu[qr --]); } | |
struct Flusher_ {~Flusher_(){flush();}}io_flusher_; | |
} using io::pc; using io::gc; using io::pstr; using io::gstr; using io::gi; using io::print; | |
// end fast read template by CYJian | |
#define iter(i, a, b) for(int i=(a); i<(b); i++) | |
#define reti(i, a, b) for(int i=(b)-1; i>=(a); i--) | |
//#define rep(i, a) iter(i, 0, a) | |
#define rep1(i, a) iter(i, 1, (a)+1) | |
#define all(a) a.begin(), a.end() | |
#define fi first | |
#define se second | |
#define pb push_back | |
using ll=long long; | |
namespace Par { | |
#define rep(i, from, to) for (int i = from; i < (to); ++i) | |
#define trav(a, x) for (auto& a : x) | |
#define sz(x) (int)(x).size() | |
typedef long long ll; | |
typedef pair<int, int> pii; | |
typedef vector<int> vi; | |
#ifdef USE_AVX256 | |
typedef __m256i M; | |
#define MM(op) _mm256_ ## op | |
#define MM_ANDNOT(a,b) _mm256_andnot_si256(a,b) | |
#define ZERO MM(setzero_si256)() | |
#define MM_SHUFFLE_MASK(a,b,c,d) \ | |
_mm256_set_epi32(a,b,c,d, a,b,c,d) | |
#define M_WORDS 4 | |
#else | |
typedef __m512i M; | |
#define MM(op) _mm512_ ## op | |
#define MM_ANDNOT(a,b) _mm512_andnot_si512(a,b) | |
#define ZERO MM(setzero_si512)() | |
#define MM_SHUFFLE_MASK(a,b,c,d) \ | |
_mm512_set_epi32(a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d) | |
#define M_WORDS 8 | |
#endif | |
#define ONE MM(set1_epi32)(-1) | |
#define M_BITS (M_WORDS * 64) | |
union U { | |
M m; | |
uint64_t words[M_WORDS]; | |
uint16_t shorts[4*M_WORDS]; | |
}; | |
// m >> 1 | |
U srl1(U u) { | |
rep(i,0,M_WORDS-1) | |
u.words[i] = (u.words[i] >> 1) | (u.words[i + 1] << 63); | |
u.words[M_WORDS-1] >>= 1; | |
return u; | |
} | |
// m << 1 | |
U sll1(U u) { | |
for (int i = M_WORDS-1; i >= 1; i--) | |
u.words[i] = (u.words[i] << 1) | (u.words[i - 1] >> 63); | |
u.words[0] <<= 1; | |
return u; | |
} | |
void setbit(U& u, unsigned bit) { | |
u.words[bit >> 6] |= 1ULL << (bit & 63); | |
} | |
bool getbit(U& u, unsigned bit) { | |
return u.words[bit >> 6] & (1ULL << (bit & 63)); | |
} | |
ostream& operator<<(ostream& os, U u) { | |
rep(i,0,M_BITS) os << (getbit(u, i) ? '1' : '0'); | |
return os; | |
} | |
template <class F> | |
void generic_for_each(F&& f) {} | |
template <class F, class Arg, class... Args> | |
void generic_for_each(F&& f, Arg& arg, Args&... args) { | |
f(arg); | |
generic_for_each(f, args...); | |
} | |
struct State { | |
int a, b; | |
int evs[8]; | |
int evi; | |
int pos; | |
int tp; | |
M prev, acc; | |
M affected, naffected; | |
M ones, twos, fours, eights; | |
}; | |
struct Temp { | |
// Persistent state from State | |
M prev, acc, affected, naffected; | |
M ones, twos, fours, eights; | |
// Temporary state | |
M cur, next, local; | |
Temp() {} | |
Temp(State& s) : | |
prev(s.prev), acc(s.acc), | |
affected(s.affected), naffected(s.naffected), | |
ones(s.ones), twos(s.twos), fours(s.fours), eights(s.eights) | |
{} | |
void writeBack(State& s) { | |
s.prev = prev; | |
s.acc = acc; | |
s.affected = affected; | |
s.naffected = naffected; | |
s.ones = ones; | |
s.twos = twos; | |
s.fours = fours; | |
s.eights = eights; | |
} | |
}; | |
struct Op { | |
enum { IS_POPCNT = 0, NEED_NEXT = 0 }; | |
}; | |
struct Op1 : Op { | |
void handle(Temp& t) { | |
t.cur &= t.naffected; | |
} | |
}; | |
struct Op2 : Op { | |
void handle(Temp& t) { | |
t.cur |= t.affected; | |
} | |
}; | |
struct Op3 : Op { | |
enum { NEED_NEXT = 1 }; | |
void handle(Temp& t) { | |
t.cur |= t.next & t.affected; | |
} | |
}; | |
struct Op4 : Op { | |
void handle(Temp& t) { | |
M old = t.cur; | |
t.cur |= t.prev & t.affected; | |
t.prev = old; | |
} | |
}; | |
struct Op5 : Op { | |
enum { NEED_NEXT = 1 }; | |
void handle(Temp& t) { | |
t.cur &= t.next | t.naffected; | |
} | |
}; | |
struct Op6 : Op { | |
void handle(Temp& t) { | |
M old = t.cur; | |
t.cur &= t.prev | t.naffected; | |
t.prev = old; | |
} | |
}; | |
// Harley-Seal popcount from http://0x80.pl/articles/sse-popcount.html | |
void CSA(M& h, M& a, M b, M c) { | |
#ifdef USE_AVX256 | |
const M u = a ^ b; | |
h = (a & b) | (u & c); | |
a = u ^ c; | |
#else | |
M a2 = a; | |
a = _mm512_ternarylogic_epi32(c, b, a2, 0x96); | |
h = _mm512_ternarylogic_epi32(c, b, a2, 0xe8); | |
#endif | |
} | |
M popcount(const M v) { | |
const M m1 = MM(set1_epi8)(0x55); | |
const M m2 = MM(set1_epi8)(0x33); | |
const M m4 = MM(set1_epi8)(0x0F); | |
const M t1 = MM(sub_epi8)(v, (MM(srli_epi16)(v, 1) & m1)); | |
const M t2 = MM(add_epi8)(t1 & m2, (MM(srli_epi16)(t1, 2) & m2)); | |
const M t3 = MM(add_epi8)(t2, MM(srli_epi16)(t2, 4)) & m4; | |
return MM(sad_epu8)(t3, ZERO); | |
} | |
struct Op7 : Op { | |
enum { IS_POPCNT = 1 }; | |
void handle16(Temp& t, U* data) { | |
M twosA, twosB, foursA, foursB, eightsA, eightsB, sixteens; | |
CSA(twosA, t.ones, data[0].m & t.affected, data[1].m & t.affected); | |
CSA(twosB, t.ones, data[2].m & t.affected, data[3].m & t.affected); | |
CSA(foursA, t.twos, twosA, twosB); | |
CSA(twosA, t.ones, data[4].m & t.affected, data[5].m & t.affected); | |
CSA(twosB, t.ones, data[6].m & t.affected, data[7].m & t.affected); | |
CSA(foursB, t.twos, twosA, twosB); | |
CSA(eightsA, t.fours, foursA, foursB); | |
CSA(twosA, t.ones, data[8].m & t.affected, data[9].m & t.affected); | |
CSA(twosB, t.ones, data[10].m & t.affected, data[11].m & t.affected); | |
CSA(foursA, t.twos, twosA, twosB); | |
CSA(twosA, t.ones, data[12].m & t.affected, data[13].m & t.affected); | |
CSA(twosB, t.ones, data[14].m & t.affected, data[15].m & t.affected); | |
CSA(foursB, t.twos, twosA, twosB); | |
CSA(eightsB, t.fours, foursA, foursB); | |
CSA(sixteens, t.eights, eightsA, eightsB); | |
t.acc = MM(add_epi64)(t.acc, | |
MM(slli_epi64)(popcount(sixteens), 4)); | |
} | |
// pshufb-based popcnt, see http://0x80.pl/articles/sse-popcount.html | |
void handle(Temp& t) { | |
const M lowMask = MM(set1_epi8)(0xf); | |
const M popcntLookup = MM_SHUFFLE_MASK( | |
0x04030302, | |
0x03020201, | |
0x03020201, | |
0x02010100 | |
); | |
const M vec = t.cur & t.affected; | |
const M lo = vec & lowMask; | |
const M hi = MM(srli_epi16)(vec, 4) & lowMask; | |
const M popcnt1 = MM(shuffle_epi8)(popcntLookup, lo); | |
const M popcnt2 = MM(shuffle_epi8)(popcntLookup, hi); | |
t.local = MM(add_epi8)(t.local, popcnt1); | |
t.local = MM(add_epi8)(t.local, popcnt2); | |
} | |
void handle8th(Temp& t) { | |
t.acc = MM(add_epi64)(t.acc, MM(sad_epu8)(t.local, ZERO)); | |
} | |
}; | |
template<class F> | |
void withOp(int tp, F&& f) { | |
switch (tp) { | |
case 1: f(Op1()); break; | |
case 2: f(Op2()); break; | |
case 3: f(Op3()); break; | |
case 4: f(Op4()); break; | |
case 5: f(Op5()); break; | |
case 6: f(Op6()); break; | |
case 7: f(Op7()); break; | |
default: assert(0); | |
} | |
} | |
constexpr int pad = 2; | |
constexpr int PREFIX_PAD = 8; | |
State states[pad]; | |
U prefixes[M_BITS+1 + PREFIX_PAD*2]; | |
U* blocks; | |
int nblocks; | |
int nops; | |
void computeAffected(State& s) { | |
int at = s.pos; | |
if (at < s.evs[s.evi]) | |
return; | |
while (at >= s.evs[s.evi]) | |
s.evi++; | |
// Compute mask for which of the entries corresponding to bits | |
// 0..63 should be updated when iterating over the range | |
// [s.evs[s.evi-1], s.evs[s.evi]). Because of how "evs" was | |
// constructed, this will be constant over that range (if we | |
// interpret infinities correctly) and we can sample a random | |
// point "at" in the interval for computing it. | |
// | |
// The mask should have bit i set if: | |
// a <= i * nblocks + at < b | |
// (a - at) / nblocks <= i < (b - at) / nblocks | |
// ceil((a - at) / nblocks) <= i < ceil((b - at) / nblocks) | |
// | |
// We assume this range is contained in [-PREFIX_PAD, M_BITS + PREFIX_PAD]. | |
auto ceildiv = [](int a) { | |
// a can be negative, causing a badly rounded division, but not by very | |
// much. Compensate by adding PREFIX_PAD*b. | |
return (a + PREFIX_PAD*nblocks + nblocks-1) / nblocks - PREFIX_PAD; | |
}; | |
int low = ceildiv(s.a - at); | |
int high = ceildiv(s.b - at); | |
assert(low <= high); | |
assert(-PREFIX_PAD <= low); | |
assert(high <= M_BITS + PREFIX_PAD); | |
s.affected = MM_ANDNOT(prefixes[PREFIX_PAD + low].m, prefixes[PREFIX_PAD + high].m); | |
s.naffected = MM_ANDNOT(s.affected, ONE); | |
} | |
void advanceOne(State& s, int goal) { | |
int at = s.pos; | |
assert(at < goal); | |
computeAffected(s); | |
int from = at, to = min(goal, s.evs[s.evi]); | |
assert(from < to); | |
Temp t(s); | |
withOp(s.tp, [&](auto&& op) { | |
rep(i,from,to) { | |
t.local = ZERO; | |
t.cur = blocks[i].m; | |
if constexpr (op.NEED_NEXT) t.next = blocks[i+1].m; | |
op.handle(t); | |
if constexpr (op.IS_POPCNT) op.handle8th(t); | |
else blocks[i].m = t.cur; | |
} | |
}); | |
t.writeBack(s); | |
s.pos = to; | |
} | |
void advance(State& s, int goal) { | |
while (s.pos != goal) | |
advanceOne(s, goal); | |
} | |
template<class... Ops> | |
void runLockstepLoop(int start, int goal, Ops&... ops) { | |
constexpr int nops = (int) sizeof...(ops); | |
assert(start < goal); | |
Temp temps[nops]; | |
rep(i,0,nops) temps[i] = Temp(states[i]); | |
rep(i,0,nops) assert(states[i].pos == start + (nops-1 - i)); | |
const int CHUNK = 16; | |
int i = start; | |
for (; i + CHUNK <= goal; i += CHUNK) { | |
int ind = 0; | |
generic_for_each([&](auto&& op) { | |
Temp& t = temps[ind]; | |
if constexpr (op.IS_POPCNT) { | |
op.handle16(t, &blocks[i + (nops-1 - ind)]); | |
} else { | |
rep(it,0,CHUNK) { | |
int j = i + it + (nops-1 - ind); | |
t.cur = blocks[j].m; | |
if constexpr (op.NEED_NEXT) t.next = blocks[j+1].m; | |
op.handle(t); | |
blocks[j].m = t.cur; | |
} | |
} | |
ind++; | |
}, ops...); | |
} | |
rep(j,0,nops) temps[j].local = ZERO; | |
for (; i < goal; ++i) { | |
int ind = 0; | |
generic_for_each([&](auto&& op) { | |
int j = i + (nops-1 - ind); | |
Temp& t = temps[ind]; | |
t.cur = blocks[j].m; | |
if constexpr (op.NEED_NEXT) t.next = blocks[j+1].m; | |
op.handle(t); | |
if constexpr (!op.IS_POPCNT) blocks[j].m = t.cur; | |
ind++; | |
}, ops...); | |
} | |
{ | |
int ind = 0; | |
generic_for_each([&](auto&& op) { | |
if constexpr (op.IS_POPCNT) op.handle8th(temps[ind]); | |
ind++; | |
}, ops...); | |
} | |
rep(i,0,nops) temps[i].writeBack(states[i]); | |
rep(i,0,nops) states[i].pos = goal + (nops-1 - i); | |
} | |
template<class... Ops> | |
void loopLockstep(int goal, Ops&... ops) { | |
constexpr int nops = sizeof...(ops); | |
if constexpr (nops == 0) return; | |
int pos = states[nops-1].pos; | |
while (pos < goal) { | |
int to = goal; | |
rep(i,0,nops) { | |
computeAffected(states[i]); | |
to = min(to, states[i].evs[states[i].evi] - (nops-1 - i)); | |
} | |
runLockstepLoop(pos, to, ops...); | |
pos = to; | |
} | |
} | |
template<class... Ops> | |
void recLockstep(int goal, Ops&&... ops) { | |
constexpr int i = sizeof...(ops); | |
if constexpr (i > pad) { | |
assert(0); | |
} else if (i == nops) { | |
loopLockstep(goal, ops...); | |
} else if constexpr (i < pad) { | |
withOp(states[i].tp, [&](auto&& op) { | |
recLockstep(goal, ops..., op); | |
}); | |
} else { | |
assert(0); | |
} | |
} | |
int main() { | |
// Given a bit array A of size N and Q operations of the types: | |
// 1. for every i in [a, b), set A[i] = 0 | |
// 2. for every i in [a, b), set A[i] = 1 | |
// 3. for every i in [a, b) set A[i] = A[i] | A[i+1] | |
// 4. for every i in [a, b) set A[i] = A[i] | A[i-1] | |
// 5. for every i in [a, b) set A[i] = A[i] & A[i+1] | |
// 6. for every i in [a, b) set A[i] = A[i] & A[i-1] | |
// 7. compute popcount of [a, b) | |
// Computes the array A after all operations have been performed. | |
int N, Q; | |
gi(N), gi(Q); | |
nblocks = max((N + M_BITS-1) / M_BITS, pad * 2 + 5); | |
prefixes[0+PREFIX_PAD].m = ZERO; | |
rep(i,0,M_BITS) { | |
prefixes[i+1+PREFIX_PAD] = prefixes[i+PREFIX_PAD]; | |
setbit(prefixes[i+1+PREFIX_PAD], i); | |
} | |
rep(i,0,PREFIX_PAD) prefixes[i].m = ZERO; | |
rep(i,0,PREFIX_PAD) prefixes[PREFIX_PAD+M_BITS+1 + i].m = ONE; | |
// Order the bits so that block i -- a 256-bit AVX vector -- consists of | |
// bits nblocks*j + i for j = 0..255. With that representation, shifting | |
// left/right by one bit becomes shifting left/right by one block, except | |
// at the edges. Picture: (with W = nblocks) | |
// | |
// 0 | 1 | 2 | 3 | 4 | ... | W-1 | |
// W | W+1 | W+2 | W+3 | W+4 | | 2*W-1 | |
// ... | |
// 255*W | ... | | N-1 | 0 | ... | 0 | |
// blocks[0] | blocks[1] | blocks[2] | | | ... | blocks[W-1] | |
blocks = (U*)_mm_malloc((nblocks + 2 * pad) * sizeof(U), sizeof(U)) + pad; | |
rep(i,0,N) { | |
int k; gi(k); | |
int bit = i / nblocks, block = i % nblocks; | |
assert(bit < M_BITS); | |
if (k) setbit(blocks[block], bit); | |
} | |
const M one = ONE; | |
// Q = min(Q, 500'000); | |
int qi = 0; | |
while (qi < Q) { | |
nops = min(Q - qi, pad); | |
qi += nops; | |
// Explicitly compute the edge wrap-around and store it as padding | |
// around the actual blocks. | |
rep(i,0,pad) { | |
blocks[i - pad] = sll1(blocks[nblocks + i - pad]); | |
blocks[nblocks + i] = srl1(blocks[i]); | |
} | |
rep(i,0,nops) { | |
int tp, a, b; | |
gi(tp), gi(a), gi(b); | |
--a; | |
if (tp == 3 || tp == 5) b--; | |
if (tp == 4 || tp == 6) a++; | |
assert(0 <= a && a <= b && b <= N); | |
if (tp == 3 || tp == 5) assert(b < N); | |
if (tp == 4 || tp == 6) assert(a > 0); | |
// tp = 7; | |
State& s = states[i]; | |
s.tp = tp; | |
s.a = a; | |
s.b = b; | |
s.prev = ZERO; | |
s.acc = ZERO; | |
s.affected = ZERO; | |
s.naffected = one; | |
s.ones = ZERO; | |
s.twos = ZERO; | |
s.fours = ZERO; | |
s.eights = ZERO; | |
// One too much, just to compute "prev". It only results in unused | |
// padding being incorrect. | |
s.pos = tp == 7 ? 0 : -nops + i; | |
s.evi = 0; | |
int* evs = s.evs; | |
evs[0] = INT_MIN; | |
evs[1] = 0; | |
evs[2] = a % nblocks; | |
evs[3] = b % nblocks; | |
evs[4] = nblocks; | |
int nevs = 5; | |
for (int x : {evs[2], evs[3]}) for (int delta : {-nblocks, nblocks}) { | |
int y = x + delta; | |
if (-pad <= y && y <= nblocks + pad) evs[nevs++] = y; | |
} | |
assert(nevs <= 7); | |
evs[nevs++] = INT_MAX; | |
sort(evs + 1, evs + nevs - 1); | |
} | |
// Initial part | |
rep(i,0,nops) { | |
advance(states[i], nops-1 - i); | |
} | |
// Middle part, in lock-step, until where states with tp == 7 need to stop | |
int goal = nblocks - nops; | |
recLockstep(goal); | |
// Final part | |
rep(i,0,nops) { | |
int to = nblocks + (states[i].tp == 7 ? 0 : nops-1 - i); | |
advance(states[i], to); | |
} | |
rep(i,0,nops) if (states[i].tp == 7) { | |
State& s = states[i]; | |
M acc = s.acc; | |
acc = MM(add_epi64)(acc, MM(slli_epi64)(popcount(s.eights), 3)); | |
acc = MM(add_epi64)(acc, MM(slli_epi64)(popcount(s.fours), 2)); | |
acc = MM(add_epi64)(acc, MM(slli_epi64)(popcount(s.twos), 1)); | |
acc = MM(add_epi64)(acc, popcount(s.ones)); | |
ll sum = 0; | |
U u; | |
u.m = acc; | |
rep(j,0,M_WORDS) sum += u.words[j]; | |
printf("%d\n", (int)sum); | |
} | |
} | |
#ifdef PRINT_FINAL | |
rep(i,0,N) { | |
int bit = i / nblocks, block = i % nblocks; | |
assert(bit < M_BITS); | |
putchar(getbit(blocks[block], bit) ? '1' : '0'); | |
} | |
putchar('\n'); | |
#endif | |
fflush(stdout); | |
return 0; | |
} | |
} | |
namespace Lin { | |
using M=__m512i; | |
union U { M m; uint64_t words[8]; }; | |
inline void setbit(U& u, const unsigned& bit) { u.words[bit>>6] |= 1ull<<(bit&63);} | |
void RR(U& u) { | |
u.words[0] = (u.words[0]>>1) | (u.words[1]<<63); | |
u.words[1] = (u.words[1]>>1) | (u.words[2]<<63); | |
u.words[2] = (u.words[2]>>1) | (u.words[3]<<63); | |
u.words[3] = (u.words[3]>>1) | (u.words[4]<<63); | |
u.words[4] = (u.words[4]>>1) | (u.words[5]<<63); | |
u.words[5] = (u.words[5]>>1) | (u.words[6]<<63); | |
u.words[6] = (u.words[6]>>1) | (u.words[7]<<63); | |
u.words[7] = u.words[7]>>1; | |
} | |
void LL(U& u) { | |
u.words[7] = (u.words[7]<<1) | (u.words[6]>>63); | |
u.words[6] = (u.words[6]<<1) | (u.words[5]>>63); | |
u.words[5] = (u.words[5]<<1) | (u.words[4]>>63); | |
u.words[4] = (u.words[4]<<1) | (u.words[3]>>63); | |
u.words[3] = (u.words[3]<<1) | (u.words[2]>>63); | |
u.words[2] = (u.words[2]<<1) | (u.words[1]>>63); | |
u.words[1] = (u.words[1]<<1) | (u.words[0]>>63); | |
u.words[0] = u.words[0]<<1; | |
} | |
int opt, l, r, sum; | |
U* blocks; | |
U prefixes[513]; | |
int nblocks; | |
namespace SimdSum { | |
struct sse_vector final { | |
__m128i v; | |
sse_vector() = delete; | |
sse_vector(sse_vector&) = delete; | |
explicit sse_vector(const __m128i& vec): v(vec) {} | |
}; | |
__m128i operator&(sse_vector a, sse_vector b) { return _mm_and_si128(a.v, b.v); } | |
__m128i operator|(sse_vector a, sse_vector b) { return _mm_or_si128(a.v, b.v); } | |
__m128i operator^(sse_vector a, sse_vector b) { return _mm_xor_si128(a.v, b.v); } | |
struct shift16 final { const unsigned bits; shift16() = delete; explicit shift16(unsigned bits) : bits(bits) {}; }; | |
__m128i operator>>(const __m128i a, const shift16 amount) { return _mm_srli_epi16(a, amount.bits); } | |
uint64_t lower_qword(const __m128i v) { return _mm_cvtsi128_si64(v); } | |
uint64_t higher_qword(const __m128i v) { return lower_qword(_mm_srli_si128(v, 8)); } | |
uint64_t simd_sum_epu64(const __m128i v) { return lower_qword(v) + higher_qword(v); } | |
uint64_t simd_sum_epu64(const __m256i v) { return static_cast<uint64_t>(_mm256_extract_epi64(v, 0)) + static_cast<uint64_t>(_mm256_extract_epi64(v, 1)) + static_cast<uint64_t>(_mm256_extract_epi64(v, 2)) + static_cast<uint64_t>(_mm256_extract_epi64(v, 3)); } | |
uint64_t simd_sum_epu64(const __m512i v) { | |
const __m256i lo = _mm512_extracti64x4_epi64(v, 0); | |
const __m256i hi = _mm512_extracti64x4_epi64(v, 1); | |
return simd_sum_epu64(lo) + simd_sum_epu64(hi); | |
} | |
}; | |
namespace AVX512_harley_seal { | |
__m512i popcount(const __m512i v) { | |
const __m512i m1 = _mm512_set1_epi8(0x55); | |
const __m512i m2 = _mm512_set1_epi8(0x33); | |
const __m512i m4 = _mm512_set1_epi8(0x0F); | |
const __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1)); | |
const __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2)); | |
const __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4; | |
return _mm512_sad_epu8(t3, _mm512_setzero_si512()); | |
} | |
void CSA(__m512i& h, __m512i& l, __m512i a, __m512i b, __m512i c) { | |
l = _mm512_ternarylogic_epi32(c, b, a, 0x96); | |
h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); | |
} | |
uint64_t popcnt(const U* data, __m512i mask, const uint64_t size) { | |
__m512i total = _mm512_setzero_si512(); | |
__m512i ones = _mm512_setzero_si512(); | |
__m512i twos = _mm512_setzero_si512(); | |
__m512i fours = _mm512_setzero_si512(); | |
__m512i eights = _mm512_setzero_si512(); | |
__m512i sixteens = _mm512_setzero_si512(); | |
__m512i twosA, twosB, foursA, foursB, eightsA, eightsB; | |
const uint64_t limit = size - size % 16; | |
uint64_t i = 0; | |
for(; i < limit; i += 16) { | |
CSA(twosA, ones, ones, data[i+0].m & mask, data[i+1].m & mask); | |
CSA(twosB, ones, ones, data[i+2].m & mask, data[i+3].m & mask); | |
CSA(foursA, twos, twos, twosA, twosB); | |
CSA(twosA, ones, ones, data[i+4].m & mask, data[i+5].m & mask); | |
CSA(twosB, ones, ones, data[i+6].m & mask, data[i+7].m & mask); | |
CSA(foursB, twos, twos, twosA, twosB); | |
CSA(eightsA,fours, fours, foursA, foursB); | |
CSA(twosA, ones, ones, data[i+8].m & mask, data[i+9].m & mask); | |
CSA(twosB, ones, ones, data[i+10].m & mask, data[i+11].m & mask); | |
CSA(foursA, twos, twos, twosA, twosB); | |
CSA(twosA, ones, ones, data[i+12].m & mask, data[i+13].m & mask); | |
CSA(twosB, ones, ones, data[i+14].m & mask, data[i+15].m & mask); | |
CSA(foursB, twos, twos, twosA, twosB); | |
CSA(eightsB, fours, fours, foursA, foursB); | |
CSA(sixteens, eights, eights, eightsA, eightsB); | |
total = _mm512_add_epi64(total, popcount(sixteens)); | |
} | |
total = _mm512_slli_epi64(total, 4); // * 16 | |
total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(eights), 3)); // += 8 * ... | |
total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(fours), 2)); // += 4 * ... | |
total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(twos), 1)); // += 2 * ... | |
total = _mm512_add_epi64(total, popcount(ones)); | |
for(; i < size; i++) | |
total = _mm512_add_epi64(total, popcount(data[i].m & mask)); | |
return SimdSum::simd_sum_epu64(total); | |
} | |
} // AVX512_harley_seal | |
void process(int L, int R) { | |
M u; | |
int low = (l-L-1+(nblocks<<4))/nblocks-15; | |
int high = (r-L-1+(nblocks<<4))/nblocks-16; | |
if(opt == 1 || opt == 5 || opt == 6) u = (high < 0) ? _mm512_set1_epi32(-1) : _mm512_andnot_si512(_mm512_andnot_si512(prefixes[low].m, prefixes[high+1].m), _mm512_set1_epi32(-1)); | |
else u = (high < 0) ? _mm512_setzero_si512() : _mm512_andnot_si512(prefixes[low].m, prefixes[high+1].m); | |
L++; R++; | |
if(opt == 1) iter(i, L, R) blocks[i].m &= u; | |
else if(opt == 2) iter(i, L, R) blocks[i].m |= u; | |
else if(opt == 3) iter(i, L, R) blocks[i].m |= blocks[i+1].m & u; | |
else if(opt == 4) reti(i, L, R) blocks[i].m |= blocks[i-1].m & u; | |
else if(opt == 5) iter(i, L, R) blocks[i].m &= blocks[i+1].m | u; | |
else if(opt == 6) reti(i, L, R) blocks[i].m &= blocks[i-1].m | u; | |
else sum += AVX512_harley_seal::popcnt(blocks+L, u, R-L); | |
} | |
signed main() { | |
ios_base::sync_with_stdio(false); cin.tie(0); | |
int N, Q; gi(N), gi(Q); | |
nblocks = max((N+511)/512, 1); | |
prefixes[0].m = prefixes[1].m = _mm512_setzero_si512(); | |
prefixes[1].words[0] |= 1; | |
iter(i, 2, 513) { | |
prefixes[i].m = prefixes[i-1].m; | |
prefixes[i].words[(i-1)>>6] |= 1ull<<((i-1)&63); | |
} | |
blocks = (U*)_mm_malloc((nblocks+2)*(sizeof(U)), sizeof(U)); | |
iter(i, 0, N) { | |
int k; gi(k); | |
if(k) setbit(blocks[i % nblocks+1], i / nblocks); | |
} | |
while(Q--) { | |
gi(opt), gi(l), gi(r); | |
l -= (opt != 4 && opt != 6); | |
r -= (opt == 3 || opt == 5); | |
if(opt == 4 || opt == 6) blocks[0] = blocks[nblocks], LL(blocks[0]); | |
else if(opt == 3 || opt == 5) blocks[nblocks+1] = blocks[1], RR(blocks[nblocks+1]); | |
int v2 = l%nblocks, v3 = r%nblocks; | |
if(v2 > v3) swap(v2, v3); | |
sum = 0; | |
if(opt == 4 || opt == 6) process(v3, nblocks), process(v2, v3), process(0, v2); | |
else process(0, v2), process(v2, v3), process(v3, nblocks); | |
if(opt == 7) print(sum), pc('\n'); | |
} | |
return 0; | |
} | |
} | |
int main() { | |
int N, Q; gi(N), gi(Q); | |
io::iT++; | |
int tmp; | |
iter(i, 0, N) gi(tmp); | |
int c7 = 0; | |
iter(i, 0, Q) { int a, b, c; gi(a), gi(b), gi(c); c7 += (a == 7); } | |
io::reverse(); | |
if(c7 > N/2) Lin::main(); | |
else Par::main(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment