/* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /// \file FastArraySearch.cpp /// SIMD-accelerated linear search for typed arrays. /// /// Each search function processes the array in 128-bit (16-byte) chunks using /// SIMD, then falls back to scalar for the remaining tail elements. The number /// of elements per chunk depends on the element width: /// - uint8_t: 16 elements per chunk /// - uint16_t: 8 elements per chunk /// - uint32_t: 4 elements per chunk /// - uint64_t: 2 elements per chunk /// /// NEON (aarch64) strategy: /// 1. Broadcast the target into every lane of a 128-bit vector (vdupq_n_*). /// 2. Load 128 bits of array data (vld1q_*). /// 3. Compare element-wise: result lanes are all-ones on match, zero /// otherwise (vceqq_*). /// 4. Quick any-match check: /// - For u8/u16/u32: vmaxvq_* reduces the comparison vector to its /// maximum lane value. Non-zero means at least one lane matched. /// - For u64: extract individual lanes (vgetq_lane_u64) since there are /// only 2 lanes. /// 5. On a hit, collapse to a scalar bitmask by ANDing the comparison lanes /// with per-lane power-of-2 bit values {1, 2, 4, ...} and summing /// (vaddvq_*), then use countTrailingZeros (forward) or countLeadingZeros /// (reverse) to find the matching element position. /// /// SSE2 (x86-64) strategy: /// 1. Broadcast the target into every lane (_mm_set1_epi*). /// 2. Load 128 bits of array data (_mm_loadu_si128, unaligned). /// 3. Compare element-wise (_mm_cmpeq_epi*): result lanes are all-ones on /// match, zero otherwise. /// 4. Collapse the comparison into a 16-bit bitmask where each bit /// corresponds to one *byte* of the 128-bit result (_mm_movemask_epi8). /// Non-zero means at least one lane matched. /// 5. On a hit, find the matching element position from the bitmask: /// - For u8: each element produces exactly 1 mask bit, so /// countTrailingZeros(mask) gives the byte (= element) index directly. /// - For u16: each element produces 2 mask bits, so divide by 2. /// - For u32: each element produces 4 mask bits, so divide by 4. /// For reverse searches, countLeadingZeros is used to find the highest /// set bit instead. /// /// For u64 on SSE2, there is no _mm_cmpeq_epi64 intrinsic, so the search /// falls through to the scalar path. #include "hermes/Support/FastArraySearch.h" #include "hermes/Support/SIMD.h" #include "llvh/Support/MathExtras.h" #include namespace hermes { namespace { /// Scalar linear search over [start, end) of \p arr for \p target. /// \tparam T the element type. /// \tparam Reverse when true, search backward returning the last match; /// when false, search forward returning the first match. /// \return the index of the match, or -1 if not found. template int64_t scalarSearch(llvh::ArrayRef arr, size_t start, size_t end, T target) { if constexpr (Reverse) { for (size_t i = end; i > start;) { --i; if (arr[i] == target) return static_cast(i); } } else { for (size_t i = start; i < end; ++i) { if (arr[i] == target) return static_cast(i); } } return -1; } //===----------------------------------------------------------------------===// // u8 search implementation //===----------------------------------------------------------------------===// /// SIMD-accelerated linear search over [start, end) of a uint8_t array. /// Uses NEON (aarch64) or SSE2 (x86-64) when available, with a scalar tail /// for remaining elements. /// \tparam Reverse when true, search backward returning the last match; /// when false, search forward returning the first match. /// \return the index of the match, or -1 if not found. template int64_t searchU8Impl( llvh::ArrayRef arr, size_t start, size_t end, uint8_t target) { assert(start <= end && "start must be <= end"); assert(end <= arr.size() && "end exceeds array bounds"); size_t len = end - start; #ifdef HERMES_SIMD_NEON // Broadcast target into all sixteen 8-bit lanes. uint8x16_t needle = vdupq_n_u8(target); // Per-lane bit values for collapsing comparison lanes into a scalar bitmask // on a hit. Since each lane is only 8 bits, the maximum power-of-2 is // 128 = 2^7, so we process the 16 lanes as two groups of 8. // MSVC doesn't support constexpr initializer for NEON vector types so we // use vld1 here. static constexpr uint8_t kLaneBitsArr[] = {1, 2, 4, 8, 16, 32, 64, 128}; static const uint8x8_t kLaneBits = vld1_u8(kLaneBitsArr); size_t numChunks = len / 16; for (size_t c = 0; c < numChunks; ++c) { size_t base = Reverse ? end - (c + 1) * 16 : start + c * 16; // Load 16 consecutive bytes. uint8x16_t data = vld1q_u8(arr.data() + base); // Per-lane equality: matching lanes become 0xFF, others 0. uint8x16_t cmp = vceqq_u8(data, needle); // Quick rejection: horizontal max is zero when no lane matched. if (vmaxvq_u8(cmp)) { // Collapse to a 16-bit scalar mask: split into two 8-lane halves, // AND each with {1,2,4,8,16,32,64,128}, sum each half to get an // 8-bit mask, then combine into a 16-bit mask. uint8_t lo = vaddv_u8(vand_u8(vget_low_u8(cmp), kLaneBits)); uint8_t hi = vaddv_u8(vand_u8(vget_high_u8(cmp), kLaneBits)); uint16_t mask = lo | (static_cast(hi) << 8); if constexpr (Reverse) return static_cast( base + (15 - llvh::countLeadingZeros(mask))); else return static_cast(base + llvh::countTrailingZeros(mask)); } } size_t tailLen = len % 16; #elif defined(HERMES_SIMD_SSE2) // Broadcast target into all sixteen 8-bit lanes. __m128i needle = _mm_set1_epi8(static_cast(target)); size_t numChunks = len / 16; for (size_t c = 0; c < numChunks; ++c) { size_t base = Reverse ? end - (c + 1) * 16 : start + c * 16; __m128i data = _mm_loadu_si128(reinterpret_cast(arr.data() + base)); // Per-lane equality: matching lanes become 0xFF, others 0. __m128i cmp = _mm_cmpeq_epi8(data, needle); // Each byte produces exactly 1 mask bit, so the mask bit position // equals the element index directly. unsigned mask = static_cast(_mm_movemask_epi8(cmp)); if (mask) { if constexpr (Reverse) { // Highest set bit = last matching byte index. unsigned pos = 31 - llvh::countLeadingZeros(mask); return static_cast(base + pos); } else { unsigned pos = llvh::countTrailingZeros(mask); return static_cast(base + pos); } } } size_t tailLen = len % 16; #else size_t tailLen = len; #endif // Scalar tail. if constexpr (Reverse) return scalarSearch(arr, start, start + tailLen, target); else return scalarSearch(arr, end - tailLen, end, target); } //===----------------------------------------------------------------------===// // u16 search implementation //===----------------------------------------------------------------------===// /// SIMD-accelerated linear search over [start, end) of a uint16_t array. /// Uses NEON (aarch64) or SSE2 (x86-64) when available, with a scalar tail /// for remaining elements. /// \tparam Reverse when true, search backward returning the last match; /// when false, search forward returning the first match. /// \return the index of the match, or -1 if not found. template int64_t searchU16Impl( llvh::ArrayRef arr, size_t start, size_t end, uint16_t target) { assert(start <= end && "start must be <= end"); assert(end <= arr.size() && "end exceeds array bounds"); size_t len = end - start; #ifdef HERMES_SIMD_NEON // Broadcast target into all eight 16-bit lanes. uint16x8_t needle = vdupq_n_u16(target); // Per-lane bit values for collapsing comparison lanes into a scalar bitmask // on a hit. // MSVC doesn't support constexpr initializer for NEON vector types so we // use vld1q here. static constexpr uint16_t kLaneBitsArr[] = {1, 2, 4, 8, 16, 32, 64, 128}; static const uint16x8_t kLaneBits = vld1q_u16(kLaneBitsArr); size_t numChunks = len / 8; for (size_t c = 0; c < numChunks; ++c) { size_t base = Reverse ? end - (c + 1) * 8 : start + c * 8; // Load 8 consecutive uint16_t elements. uint16x8_t data = vld1q_u16(arr.data() + base); // Per-lane equality: matching lanes become 0xFFFF, others 0. uint16x8_t cmp = vceqq_u16(data, needle); // Quick rejection: horizontal max is zero when no lane matched. if (vmaxvq_u16(cmp)) { // Collapse to an 8-bit scalar mask: AND with {1,2,4,...,128} and sum, // then use ctz/clz to find the first/last match. uint16_t mask = vaddvq_u16(vandq_u16(cmp, kLaneBits)); if constexpr (Reverse) return static_cast( base + (15 - llvh::countLeadingZeros(mask))); else return static_cast(base + llvh::countTrailingZeros(mask)); } } size_t tailLen = len % 8; #elif defined(HERMES_SIMD_SSE2) // Broadcast target into all eight 16-bit lanes. __m128i needle = _mm_set1_epi16(static_cast(target)); size_t numChunks = len / 8; for (size_t c = 0; c < numChunks; ++c) { size_t base = Reverse ? end - (c + 1) * 8 : start + c * 8; __m128i data = _mm_loadu_si128(reinterpret_cast(arr.data() + base)); // Per-lane equality: matching lanes become 0xFFFF, others 0. __m128i cmp = _mm_cmpeq_epi16(data, needle); // Each 16-bit lane produces 2 mask bits, so divide the bit // position by 2 to get the element index. unsigned mask = static_cast(_mm_movemask_epi8(cmp)); if (mask) { if constexpr (Reverse) { // Highest set bit / 2 = last matching element index. unsigned pos = (31 - llvh::countLeadingZeros(mask)) / 2; return static_cast(base + pos); } else { unsigned pos = llvh::countTrailingZeros(mask) / 2; return static_cast(base + pos); } } } size_t tailLen = len % 8; #else size_t tailLen = len; #endif // Scalar tail. if constexpr (Reverse) return scalarSearch(arr, start, start + tailLen, target); else return scalarSearch(arr, end - tailLen, end, target); } //===----------------------------------------------------------------------===// // u32 search implementation //===----------------------------------------------------------------------===// /// SIMD-accelerated linear search over [start, end) of a uint32_t array. /// Uses NEON (aarch64) or SSE2 (x86-64) when available, with a scalar tail /// for remaining elements. /// \tparam Reverse when true, search backward returning the last match; /// when false, search forward returning the first match. /// \return the index of the match, or -1 if not found. template int64_t searchU32Impl( llvh::ArrayRef arr, size_t start, size_t end, uint32_t target) { assert(start <= end && "start must be <= end"); assert(end <= arr.size() && "end exceeds array bounds"); size_t len = end - start; #ifdef HERMES_SIMD_NEON // Broadcast target into all four 32-bit lanes. uint32x4_t needle = vdupq_n_u32(target); // Per-lane bit values used to collapse comparison lanes into a scalar // bitmask on a hit: AND the all-ones/zero lanes with {1,2,4,8} then sum. // MSVC doesn't support constexpr initializer for NEON vector types so we // use vld1q here. static constexpr uint32_t kLaneBitsArr[] = {1, 2, 4, 8}; static const uint32x4_t kLaneBits = vld1q_u32(kLaneBitsArr); size_t numChunks = len / 4; for (size_t c = 0; c < numChunks; ++c) { size_t base = Reverse ? end - (c + 1) * 4 : start + c * 4; // Load 4 consecutive uint32_t elements. uint32x4_t data = vld1q_u32(arr.data() + base); // Per-lane equality: matching lanes become 0xFFFFFFFF, others 0. uint32x4_t cmp = vceqq_u32(data, needle); // Quick rejection: horizontal max is zero when no lane matched. if (vmaxvq_u32(cmp)) { // Collapse to a 4-bit scalar mask where bit N is set iff lane N // matched, then use ctz/clz to find the first/last match. uint32_t mask = vaddvq_u32(vandq_u32(cmp, kLaneBits)); if constexpr (Reverse) return static_cast( base + (31 - llvh::countLeadingZeros(mask))); else return static_cast(base + llvh::countTrailingZeros(mask)); } } size_t tailLen = len % 4; #elif defined(HERMES_SIMD_SSE2) // Broadcast target into all four 32-bit lanes. __m128i needle = _mm_set1_epi32(static_cast(target)); size_t numChunks = len / 4; for (size_t c = 0; c < numChunks; ++c) { size_t base = Reverse ? end - (c + 1) * 4 : start + c * 4; __m128i data = _mm_loadu_si128(reinterpret_cast(arr.data() + base)); // Per-lane equality: matching lanes become 0xFFFFFFFF, others 0. __m128i cmp = _mm_cmpeq_epi32(data, needle); // Extract the MSB of each byte into a 16-bit mask. // Each 32-bit lane spans 4 bytes, so a matching lane sets 4 // consecutive mask bits (a nibble like 0xF). unsigned mask = static_cast(_mm_movemask_epi8(cmp)); if (mask) { // Each 32-bit lane produces 4 mask bits, so divide the bit // position by 4 to get the element index. if constexpr (Reverse) return static_cast( base + (31 - llvh::countLeadingZeros(mask)) / 4); else return static_cast(base + llvh::countTrailingZeros(mask) / 4); } } size_t tailLen = len % 4; #else size_t tailLen = len; #endif // Scalar tail. if constexpr (Reverse) return scalarSearch(arr, start, start + tailLen, target); else return scalarSearch(arr, end - tailLen, end, target); } //===----------------------------------------------------------------------===// // u64 search implementation //===----------------------------------------------------------------------===// /// SIMD-accelerated linear search over [start, end) of a uint64_t array. /// Uses NEON (aarch64) when available, with a scalar fallback otherwise /// (SSE2 lacks a 64-bit integer compare). /// \tparam Reverse when true, search backward returning the last match; /// when false, search forward returning the first match. /// \return the index of the match, or -1 if not found. template int64_t searchU64Impl( llvh::ArrayRef arr, size_t start, size_t end, uint64_t target) { assert(start <= end && "start must be <= end"); assert(end <= arr.size() && "end exceeds array bounds"); size_t len = end - start; #ifdef HERMES_SIMD_NEON // Broadcast target into both 64-bit lanes. uint64x2_t needle = vdupq_n_u64(target); size_t numChunks = len / 2; for (size_t c = 0; c < numChunks; ++c) { size_t base = Reverse ? end - (c + 1) * 2 : start + c * 2; uint64x2_t data = vld1q_u64(arr.data() + base); // Per-lane equality: matching lanes become all-ones. uint64x2_t cmp = vceqq_u64(data, needle); // No vmaxvq_u64, so extract each lane individually. uint64_t lane0 = vgetq_lane_u64(cmp, 0); uint64_t lane1 = vgetq_lane_u64(cmp, 1); if constexpr (Reverse) { if (lane1) return static_cast(base + 1); if (lane0) return static_cast(base); } else { if (lane0) return static_cast(base); if (lane1) return static_cast(base + 1); } } size_t tailLen = len % 2; #else // SSE2 lacks a 64-bit integer compare (_mm_cmpeq_epi64 is SSE4.1), // so fall through to the scalar path. size_t tailLen = len; #endif // Scalar tail. if constexpr (Reverse) return scalarSearch(arr, start, start + tailLen, target); else return scalarSearch(arr, end - tailLen, end, target); } } // namespace //===----------------------------------------------------------------------===// // Public API //===----------------------------------------------------------------------===// int64_t searchU8( llvh::ArrayRef arr, size_t start, size_t end, uint8_t target) { return searchU8Impl(arr, start, end, target); } int64_t searchReverseU8( llvh::ArrayRef arr, size_t start, size_t end, uint8_t target) { return searchU8Impl(arr, start, end, target); } int64_t searchU16( llvh::ArrayRef arr, size_t start, size_t end, uint16_t target) { return searchU16Impl(arr, start, end, target); } int64_t searchReverseU16( llvh::ArrayRef arr, size_t start, size_t end, uint16_t target) { return searchU16Impl(arr, start, end, target); } int64_t searchU32( llvh::ArrayRef arr, size_t start, size_t end, uint32_t target) { return searchU32Impl(arr, start, end, target); } int64_t searchReverseU32( llvh::ArrayRef arr, size_t start, size_t end, uint32_t target) { return searchU32Impl(arr, start, end, target); } int64_t searchU64( llvh::ArrayRef arr, size_t start, size_t end, uint64_t target) { return searchU64Impl(arr, start, end, target); } int64_t searchReverseU64( llvh::ArrayRef arr, size_t start, size_t end, uint64_t target) { return searchU64Impl(arr, start, end, target); } } // namespace hermes