#include <x86intrin.h>
#include <cpuid.h>
#include <stdio.h>
#include <time.h>

#define N 100000000 // iterations of each timed inner loop
#define I long long // shorthand integer type used for tsc tick counts

// benchmark to demonstrate false dependencies in compression and expansion instructions (observed in skx/clx and zen4)

// NOTE(review): the vinsert after each compress/expand appears to exist to keep
// ymm0 in a loop-carried chain and lengthen it, magnifying the timing gap when
// a (false) dependency on the destination exists -- confirm.

// X-macro table: F(instruction, dep-breaking xor, insert) triples to benchmark.
// Each instruction is paired with the xor idiom used to break its dependency
// chain and the insert used to extend the chain (float vs. integer domain).
// The word/byte compress/expand forms require AVX512-VBMI2, so they are only
// listed when -DVBMI2 is given or the compiler target advertises __AVX512VBMI2__.
#if defined(VBMI2) || defined(__AVX512VBMI2__)
#define VFS(F) \
 F(vcompresspd,vxorpd,vinsertf128) \
 F(vcompressps,vxorps,vinsertf128) \
 F(vpcompressq,vpxorq,vinserti128) \
 F(vpcompressd,vpxorq,vinserti128) \
 F(vpcompressw,vpxorq,vinserti128) \
 F(vpcompressb,vpxorq,vinserti128) \
 F(vexpandpd,  vxorpd,vinsertf128) \
 F(vexpandps,  vxorps,vinsertf128) \
 F(vpexpandq,  vpxorq,vinserti128) \
 F(vpexpandd,  vpxorq,vinserti128) \
 F(vpexpandw,  vpxorq,vinserti128) \
 F(vpexpandb,  vpxorq,vinserti128)
#else
// Same table without the VBMI2-only word/byte forms.
#define VFS(F) \
 F(vcompresspd,vxorpd,vinsertf128) \
 F(vcompressps,vxorps,vinsertf128) \
 F(vpcompressq,vpxorq,vinserti128) \
 F(vpcompressd,vpxorq,vinserti128) \
 F(vexpandpd,  vxorpd,vinsertf128) \
 F(vexpandps,  vxorps,vinsertf128) \
 F(vpexpandq,  vpxorq,vinserti128) \
 F(vpexpandd,  vpxorq,vinserti128)
#endif

// Min/max helpers. Parameters are fully parenthesized so the macros stay
// correct if a compound expression is ever passed in; note the arguments are
// still evaluated twice, so keep them side-effect free.
#define minx(x,y) ((x)<(y)?(x):(y))
#define maxx(x,y) ((x)>(y)?(x):(y))
// Ratio tests on nonnegative tick counts (small constant multipliers, so no
// overflow risk for realistic tsc deltas in a long long).
#define CLOSE(x,y) (minx(x,y)*21>=maxx(x,y)*20) // within 5%
#define FAR(x,y)   (minx(x,y)*6 <maxx(x,y)*5)   // outside 20%
// Classify one instruction's four timings and print a verdict:
//   t00 = merge-masking, no dep break   (true dependency: slow reference)
//   t01 = zero-masking,  no dep break   (the measurement under test)
//   t10 = merge-masking, dep broken     (fast reference)
//   t11 = zero-masking,  dep broken     (fast reference)
// "bad"   = t01 tracks the true-dependency time (false dependency present)
// "ok"    = t01 tracks the dep-broken time
// "dunno" = the references don't separate cleanly, or t01 matches neither
__attribute__((noinline)) void diagnose(const char *n, I t00,I t01,I t10,I t11) {
	// sanity: both dep-broken runs agree with each other, and the true-dep
	// run is clearly (>20%) slower than them
	int sane = CLOSE(t10,t11) && FAR(t00,maxx(t10,t11)) && t00 > maxx(t10,t11);
	int buggy = 0; // only meaningful when sane; initialized defensively
	if (CLOSE(t01,t00)) { // close time to true dep
		buggy = 1;
	} else if (CLOSE(t01,t10)) { // close time to no dep
		buggy = 0;
	} else {
		sane = 0; // dunno
	}
	printf("%s: %s (%lld %lld %lld %lld)\n", n, sane ? buggy ? "bad" : "ok" : "dunno", t00, t01, t10, t11);
}
// Time one call of f(N) in tsc ticks, as a statement expression.
// First runs f(N) untimed as a warm-up, then fences with cpuid (a serializing
// instruction) before __rdtsc and after __rdtscp so that neither f's work nor
// surrounding code can be reordered across the measured window. The cpuid asm
// declares eax/ebx/ecx/edx as outputs so no registers are silently clobbered.
#define TIMEF(f) ({\
	int eax,ecx,edx,ebx;\
	f(N);\
	eax=0;__asm__ volatile("cpuid" : "+a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx));\
	I before = __rdtsc();\
	f(N);\
	I after = __rdtscp(&(unsigned){0});\
	eax=0;__asm__ volatile("cpuid" : "+a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx));\
	after - before;})
// For each instruction, generate four naked timing loops plus a bench driver.
// k1 is zeroed in every variant, so all lanes are masked off:
//   comp##00: merge-masking {k1}     -> old ymm0 must be preserved: a genuine
//                                       loop-carried dependency (slow reference)
//   comp##01: zero-masking  {k1}{z}  -> old ymm0 is architecturally irrelevant;
//                                       running slow here exposes a FALSE dependency
//   comp##10: merge-masking, with a dependency-breaking xor-zeroing idiom on
//                                       ymm0 each iteration (fast reference)
//   comp##11: zero-masking,  same xor dep break (fast reference)
// The insert after each compress/expand reads and writes ymm0, keeping it in
// the per-iteration chain. The loop counter arrives in rdi (first integer
// argument, SysV ABI); the functions are naked, so the asm supplies its own ret.
#define BENCH(comp,xor,insert) \
/* merge-masking, no dep break: true dependency on ymm0 */ \
__attribute__((naked,noinline)) void comp##00(I dummy) { \
	__asm__ volatile( \
".intel_syntax noprefix\n" \
#xor " ymm0,ymm0,ymm0\n" \
#xor " ymm1,ymm1,ymm1\n" \
"kxorq k1,k1,k1\n" \
".balign 64\n" \
"1:\n" \
#comp " ymm0{k1},ymm1\n" \
#insert " ymm0,ymm0,xmm1,0\n" \
"dec rdi\n" \
"jnz 1b\n" \
"ret\n" \
".att_syntax prefix\n" \
	); \
} \
/* zero-masking, no dep break: the case under test */ \
__attribute__((naked,noinline)) void comp##01(I dummy) { \
	__asm__ volatile( \
".intel_syntax noprefix\n" \
#xor " ymm0,ymm0,ymm0\n" \
#xor " ymm1,ymm1,ymm1\n" \
"kxorq k1,k1,k1\n" \
".balign 64\n" \
"1:\n" \
#comp " ymm0{k1}{z},ymm1\n" \
#insert " ymm0,ymm0,xmm1,0\n" \
"dec rdi\n" \
"jnz 1b\n" \
"ret\n" \
".att_syntax prefix\n" \
	); \
} \
/* merge-masking, xor dep break each iteration: fast reference */ \
__attribute__((naked,noinline)) void comp##10(I dummy) { \
	__asm__ volatile( \
".intel_syntax noprefix\n" \
#xor " ymm0,ymm0,ymm0\n" \
#xor " ymm1,ymm1,ymm1\n" \
"kxorq k1,k1,k1\n" \
".balign 64\n" \
"1:\n" \
#xor " ymm0,ymm0,ymm0\n" \
#comp " ymm0{k1},ymm1\n" \
#insert " ymm0,ymm0,xmm1,0\n" \
"dec rdi\n" \
"jnz 1b\n" \
"ret\n" \
".att_syntax prefix\n" \
	); \
} \
/* zero-masking, xor dep break each iteration: fast reference */ \
__attribute__((naked,noinline)) void comp##11(I dummy) { \
	__asm__ volatile( \
".intel_syntax noprefix\n" \
#xor " ymm0,ymm0,ymm0\n" \
#xor " ymm1,ymm1,ymm1\n" \
"kxorq k1,k1,k1\n" \
".balign 64\n" \
"1:\n" \
#xor " ymm0,ymm0,ymm0\n" \
#comp " ymm0{k1}{z},ymm1\n" \
#insert " ymm0,ymm0,xmm1,0\n" \
"dec rdi\n" \
"jnz 1b\n" \
"ret\n" \
".att_syntax prefix\n" \
	); \
} \
/* time all four variants and print the verdict for this instruction */ \
__attribute__((noinline)) void bench##comp() { \
	I t00=TIMEF(comp##00),t01=TIMEF(comp##01),t10=TIMEF(comp##10),t11=TIMEF(comp##11); \
	diagnose(#comp, t00, t01, t10, t11); \
}
// Instantiate the four loops and the driver for every instruction in the table.
VFS(BENCH)

#define CALL(f,x,y) bench##f();
int main() {
	VFS(CALL)
}
