package
0.2.3
Repository: https://github.com/viant/vec.git
Documentation: pkg.go.dev

# README

Vectorized Bitwise Operations

Introduction

Vectorized bitwise operations use SIMD (Single Instruction Multiple Data) CPU extension to perform 128/256/512 bits wide operation per CPU cycle. This library uses the following extension: AVX2, AVX512, Neon, SVE. All implemented cases provide substantial performance benefit over go lang '&', '|', '^' operators.

Usage:

Vectorized AND

package mypkg;

import "github.com/viant/vec/bitwise"

func ExampleUint64s_And() {
	{
		out := bitwise.Uint64s(make([]uint64, 8))
		v1 := []uint64{1, 2, 3, 4, 5, 6, 7, 8}
		v2 := []uint64{1, 7, 3, 4, 3, 6, 7, 2}
		out.And(v1, v2)
	}
	{
		out := make([]uint64, 8)
		v1 := []uint64{1, 2, 3, 4, 5, 6, 7, 8}
		v2 := []uint64{1, 7, 3, 4, 3, 6, 7, 2}
		v3 := []uint64{1, 1, 0, 4, 1, 6, 7, 2}
		bitwise.Uint64s(out).AndV3(v1, v2, v3)
	}
}

Vectorized OR

package mypkg;

import "github.com/viant/vec/bitwise"

func ExampleUint64s_Or() {
	{
		out := make([]uint64, 8)
		v1 := []uint64{1, 2, 3, 4, 5, 6, 7, 8}
		v2 := []uint64{1, 7, 3, 4, 3, 6, 7, 2}
		bitwise.Uint64s(out).Or(v1, v2)
	}
	{
		out := bitwise.Uint64s(make([]uint64, 8))
		v1 := []uint64{1, 2, 3, 4, 5, 6, 7, 8}
		v2 := []uint64{1, 7, 3, 4, 3, 6, 7, 2}
		v3 := []uint64{1, 1, 0, 4, 1, 6, 7, 2}
		out.OrV3(v1, v2, v3)
	}
}

Vectorized XOR

package mypkg;

import "github.com/viant/vec/bitwise"

func ExampleUint64s_XOr() {
	{
		out := bitwise.Uint64s(make([]uint64, 8))
		v1 := []uint64{1, 2, 3, 4, 5, 6, 7, 8}
		v2 := []uint64{1, 7, 3, 4, 3, 6, 7, 2}
		out.Xor(v1, v2)
	}
	{
		out := bitwise.Uint64s(make([]uint64, 8))
		v1 := []uint64{1, 2, 3, 4, 5, 6, 7, 8}
		v2 := []uint64{1, 7, 3, 4, 3, 6, 7, 2}
		v3 := []uint64{1, 1, 0, 4, 1, 6, 7, 2}
		out.XorV3(v1, v2, v3)
	}
}

Benchmarks

The benchmark uses 3 sets of []uints Small - [8]uint64 Medium - [32]uint64 XLarge - [128]uint64

ARM64 (Neon/SVE)

Apple M1
goos: darwin
goarch: arm64
pkg: github.com/viant/vec/bitwise
BenchmarkAnd_S_Arm64Neon-8      362116297                3.131 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_M_Arm64Neon-8      100000000               11.54 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_XL_Arm64Neon-8     48047887                24.22 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_M_V3_Neon-8        100000000               11.53 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_S-8                348350084                3.449 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_M-8                150012367                7.994 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_XL-8               38915920                30.50 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_S_Naive-8          186783776                6.564 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_M_Naive-8          70419261                17.00 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_XL_Naive-8         17308191                68.91 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_M_V3-8             100000000               11.38 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_XL_V3-8            36936477                32.07 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_M_V3_Naive-8       52957920                22.63 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_XL_V3_Naive-8      13048790                89.38 ns/op            0 B/op          0 allocs/op
Graviton 2
goos: linux
goarch: arm64
pkg: github.com/viant/vec/bitwise
BenchmarkAnd_S_Arm64Neon-16     	148614336	         8.077 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_M_Arm64Neon-16     	68040201	        17.62 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_Arm64Neon-16    	21396996	        56.07 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3_Neon-16       	22370042	        53.63 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S-16               	136180149	         8.812 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_M-16               	65123619	        18.42 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL-16              	21087504	        56.88 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S_Naive-16         	75354376	        15.90 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_Naive-16         	26723798	        44.91 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_Naive-16        	 7453200	       161.0 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3-16            	22334678	        53.42 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_V3-16           	 6257778	       192.0 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3_Naive-16      	20654671	        58.09 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_V3_Naive-16     	 5529211	       221.2 ns/op	       0 B/op	       0 allocs/op
Graviton 3
BenchmarkAnd_S_Arm64SVE
BenchmarkAnd_S_Arm64SVE-48      	283087328	         4.239 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_Arm64SVE
BenchmarkAnd_M_Arm64SVE-48      	100000000	        10.04 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_Arm64SVE
BenchmarkAnd_XL_Arm64SVE-48     	41910331	        28.58 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S_Arm64Neon
BenchmarkAnd_S_Arm64Neon-48     	224370678	         5.349 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_Arm64Neon
BenchmarkAnd_M_Arm64Neon-48     	92252922	        13.00 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_Arm64Neon
BenchmarkAnd_XL_Arm64Neon-48    	29059210	        41.26 ns/op	       0 B/op	       0 allocs/op
BenchmarkOr_M_V3_Neon
BenchmarkOr_M_V3_Neon-48        	71975278	        16.66 ns/op	       0 B/op	       0 allocs/op
BenchmarkOr_M_V4_Neon
BenchmarkOr_M_V4_Neon-48        	53198659	        22.52 ns/op	       0 B/op	       0 allocs/op
BenchmarkOr_M_V5_Neon
BenchmarkOr_M_V5_Neon-48        	40883546	        29.32 ns/op	       0 B/op	       0 allocs/op
BenchmarkOr_M_V3_SVE
BenchmarkOr_M_V3_SVE-48         	95986852	        12.52 ns/op	       0 B/op	       0 allocs/op
BenchmarkOr_M_V4_SVE
BenchmarkOr_M_V4_SVE-48         	75589498	        15.97 ns/op	       0 B/op	       0 allocs/op
BenchmarkOr_M_V5_SVE
BenchmarkOr_M_V5_SVE-48         	63350295	        18.55 ns/op	       0 B/op	       0 allocs/op
BenchmarkOr_M_V6_SVE
BenchmarkOr_M_V6_SVE-48         	53549683	        22.27 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S
BenchmarkAnd_S-48               	266749639	         4.530 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M
BenchmarkAnd_M-48               	100000000	        10.64 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL
BenchmarkAnd_XL-48              	39472730	        30.16 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S_Naive
BenchmarkAnd_S_Naive-48         	223677896	         5.365 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_Naive
BenchmarkAnd_M_Naive-48         	60182758	        20.01 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_Naive
BenchmarkAnd_XL_Naive-48        	15832388	        76.02 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3
BenchmarkAnd_M_V3-48            	91067534	        13.35 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_V3
BenchmarkAnd_XL_V3-48           	34992511	        34.38 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3_Naive
BenchmarkAnd_M_V3_Naive-48      	44016760	        27.25 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_V3_Naive
BenchmarkAnd_XL_V3_Naive-48     	11348571	       105.9 ns/op	       0 B/op	       0 allocs/op

AMD64 (AVX2/AVX512)

Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz

BenchmarkAnd_S_AVX2-16          296558868                4.015 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_M_AVX2-16          164493822                6.343 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_XL_AVX2-16         56744065                18.01 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_S-16               269001751                4.854 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_M-16               148436960                8.201 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_XL-16              59192054                21.08 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_S_Naive-16         146186683                7.764 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_M_Naive-16         61259191                20.51 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_XL_Naive-16        13925252                76.87 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_M_V3-16            141754650                8.224 ns/op           0 B/op          0 allocs/op
BenchmarkAnd_XL_V3-16           55797906                23.16 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_M_V3_Naive-16      48094837                25.76 ns/op            0 B/op          0 allocs/op
BenchmarkAnd_XL_V3_Naive-16     11524884                97.60 ns/op            0 B/op          0 allocs/op

Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz

BenchmarkAnd_S_AVX2-16         	237390595	         5.024 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_M_AVX2-16         	150596898	         7.963 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_XL_AVX2-16        	53840293	        22.12 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S_AVX512-16       	141063717	         8.508 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_M_AVX512-16       	51851728	        23.09 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_AVX512-16      	13561696	        88.42 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3_AVX2-16      	127001692	         9.449 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_M_V3_AVX512-16    	49551648	        24.16 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S-16              	204319275	         5.795 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_M-16              	135662316	         8.845 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_XL-16             	52037328	        23.00 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_S_Naive-16        	155402485	         7.597 ns/op	   0 B/op	       0 allocs/op
BenchmarkAnd_M_Naive-16        	53635558	        22.28 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_Naive-16       	13346347	        90.31 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3-16           	100000000	        10.03 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_V3-16          	49549333	        24.19 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_M_V3_Naive-16     	40608735	        29.53 ns/op	       0 B/op	       0 allocs/op
BenchmarkAnd_XL_V3_Naive-16    	10238077	       117.1 ns/op	       0 B/op	       0 allocs/op

# Type aliases

Uint64s defines Uint64s.