Abstract:
Optimising strlen function using SSE2 SIMD instructions.

Created by Dmitry Kostjuchenko
Last changed
Contributors: Peter Kankowski and Nazo
Filed under Low-level code optimization

Share on social sitesReddit Digg Delicious Buzz Facebook Twitter

SSE2 optimised strlen

Besides the technique of unrolling the loop and thus introducing performance boost for strlen function explained in Optimized strlen function it is also possible to go a little bit further and try to optimise strlen with SSE2 SIMD instructions to make it even more faster.

SSE2 instructions provide ability to execute single operation on multiple variables simultaneously and we can try to use such behaviour to optimise a stage of searching for 0 in the string.

Optimized strlen function did provide unrolling but 4 conditions are processed by separate CPU instructions if compiled for 32-bit software platform and 8! for 64-bit, so we can try to optimise it into just 2 SSE2 instructions:

  1. PCMPEQB (_mm_cmpeq_epi8): compares 8-bit integers for equality (we will use it to get position of 0 in 16 positions (8-bit numbers))
  2. PMOVMSKB (_mm_movemask_epi8): creates 16-bit mask from the most significant bits of 8-bit integers (we use it to get integral mask with 0 positions expressed in unset bits for simplified comparison and final phase 'count_bits_to_0')

Implementation:

#ifndef WORDS_BIGENDIAN
    #if 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            register int i = 0;
            if (!(x & (1 << 0))) i ++;
            else return i;
            if (!(x & (1 << 1))) i ++;
            else return i;
            if (!(x & (1 << 2))) i ++;
            else return i;
            if (!(x & (1 << 3))) i ++;
            else return i;
            if (!(x & (1 << 4))) i ++;
            else return i;
            if (!(x & (1 << 5))) i ++;
            else return i;
            if (!(x & (1 << 6))) i ++;
            else return i;
            if (!(x & (1 << 7))) i ++;
            else return i;
            if (!(x & (1 << 8))) i ++;
            else return i;
            if (!(x & (1 << 9))) i ++;
            else return i;
            if (!(x & (1 << 10))) i ++;
            else return i;
            if (!(x & (1 << 11))) i ++;
            else return i;
            if (!(x & (1 << 12))) i ++;
            else return i;
            if (!(x & (1 << 13))) i ++;
            else return i;
            if (!(x & (1 << 14))) i ++;
            else return i;
            if (!(x & (1 << 15))) i ++;
            return i;
        }
    #elif 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            // http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski
            register int n = 1;
            if ((x & 0x000000FFU) == 0) {n += 8; x >>= 8;}
            if ((x & 0x0000000FU) == 0) {n += 4; x >>= 4;}
            if ((x & 0x00000003U) == 0) {n += 2; x >>= 2;}
            return n - (x & 1);
        }
    #else
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
        {                                                 // this is current winner for speed
            static const unsigned char table[256] = 
            {
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
            };
            if ((unsigned char)x)
                return table[(unsigned char)x];
            return table[x >> 8] + 8; // t[x / 256] + 8
        }
    #endif
#else
    #if 0
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            register int i = 0;
            if (!(x & (1 << 15))) i ++;
            else return i;
            if (!(x & (1 << 14))) i ++;
            else return i;
            if (!(x & (1 << 13))) i ++;
            else return i;
            if (!(x & (1 << 12))) i ++;
            else return i;
            if (!(x & (1 << 11))) i ++;
            else return i;
            if (!(x & (1 << 10))) i ++;
            else return i;
            if (!(x & (1 << 9))) i ++;
            else return i;
            if (!(x & (1 << 8))) i ++;
            else return i;
            if (!(x & (1 << 7))) i ++;
            else return i;
            if (!(x & (1 << 6))) i ++;
            else return i;
            if (!(x & (1 << 5))) i ++;
            else return i;
            if (!(x & (1 << 4))) i ++;
            else return i;
            if (!(x & (1 << 3))) i ++;
            else return i;
            if (!(x & (1 << 2))) i ++;
            else return i;
            if (!(x & (1 << 1))) i ++;
            else return i;
            if (!(x & (1 << 0))) i ++;
            return i;
        }
    #else
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask
            register int n = 0;
            if (x <= 0x000000FFU) {n = n + 8; x = x << 8;}
            if (x <= 0x00000FFFU) {n = n + 4; x = x << 4;}
            if (x <= 0x00003FFFU) {n = n + 2; x = x << 2;}
            if (x <= 0x00007FFFU) {n = n + 1;}
            return n;
        }
    #endif
#endif
size_t strlen(const char *str)
{
    register size_t len = 0;
    // align to 16 bytes
    while ((((intptr_t)str) & (sizeof(__m128i)-1)) != 0)
    {
        if (*str++ == 0)
            return len;
        ++ len;
    }
    // search for 0
    __m128i xmm0 = _mm_setzero_si128();
    __m128i xmm1;
    int mask = 0;
    for (;;)
    {
        xmm1 = _mm_load_si128((__m128i *)str);
        xmm1 = _mm_cmpeq_epi8(xmm1, xmm0);
        if ((mask = _mm_movemask_epi8(xmm1)) != 0)
        {
            // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
            // find index of first set bit

        #ifndef _DISABLE_ASM_BSF // define it to disable ASM
            #if (_MSC_VER >= 1300)   // make sure <intrin.h> is included
                unsigned long pos;
                _BitScanForward(&pos, mask);
                len += (size_t)pos;
            #elif defined(_MSC_VER)  // earlier MSVC's do not have _BitScanForward, use inline asm
                __asm bsf edx, mask ; edx = bsf(mask)
                __asm add edx, len  ; edx += len
                __asm mov len, edx  ; len = edx
            #elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
                len += __builtin_ctz(mask);
            #elif defined(__GNUC__) // older GCC shall use inline asm
                unsigned int pos;
                asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
                len += (size_t)pos;
            #else                    // none of choices exist, use local BSF implementation
                len += count_bits_to_0(mask);
            #endif
        #else
            len += count_bits_to_0(mask);
        #endif

            break;
        }
        str += sizeof(__m128i);
        len += sizeof(__m128i);
    }
    return len;
}

This implementation would win more performance boost if 'count_bits_to_0' is optimised in less conditions.

We could use _mm_loadu_si128 to load unaligned data and thus skip own aligning loop but the performance will still be worse due to additional CPU cycles if _mm_loadu_si128 is used.

SSE2 SIMD instructions are present on all modern CPUs and thus this implementation may bring real benefits to intensive database/text processing applications.

License: Public Domain.

About the author

Dmitry Kostjuchenko

I am a developer of own commercial project (http://www.iauxsoft.com) that is devoted for sound, database, network and input device events processing. Since 5-th class of the school I was in programming on Basic on BK personal computer, many who studied in the USSR school in 90-th shall remember this :) and DVK as server for them :))) I like optimising code and inventing a bicycle and can't do anything about it.

Created by Dmitry Kostjuchenko
Last changed
Contributors: Peter Kankowski and Nazo

26 comments

Ten recent comments are shown below. Show all comments

ace,

I don't have these new MSVC compilers here, but I guess that they really don't use size_t at all, instead they recognize strlen as an intrinsic so the code optimization phase reduces the constant expression over the constant sequence of characters. I don't expect that it's so common among compilers. Anybody tried anything else? Anyway, am I right that if the string is "ab\0cd" the result will be 2 in these new MSVC compilers, which proves that they don't use size_t?

Peter Kankowski,

We probably misunderstood each other: size_t is not some kind of optimization, but an unsigned integer type to store the size of an object.

For "ab\0cd", push 2 is generated by MSVC. GCC doesn't support this optimization (as of GCC 4.4).

ace,
size_t is not some kind of optimization

Lapsus calami. I've meant "sizeof." sizeof( "ab\0cd" ) - 1 is of course 5. Applying strlen results in 2. Newer MSVC compilers obviously really use strlen to calculate the result. I'd also like to know if the optimization is wired to specific function or if they really can recognize some function without side effects? What happens if you make your own "int mystrlen( char* s )" and you call it the same way as before?

PRR,

Below is the code generated with XCode 3.2 (Mac OS X 10.6 Snow Leopard, GCC).

int main (int argc, const char * argv[]) {
    return strlen("SomeString");
}
0x00001f8a  <+0000>  push   %ebp
0x00001f8b  <+0001>  mov    %esp,%ebp
0x00001f8d  <+0003>  mov    $0xb,%eax
0x00001f92  <+0008>  leave  
0x00001f93  <+0009>  ret

And it works correctly in case of return strlen("Some\0String"), moving 4 to eax.

PRR,

Sorry for badly formatted post. Please feel free to adjust it.

Peter Kankowski,

Thanks, that's interesting. It appears that for GCC:

strlen("SomeString") // is optimized
 
char * s = "SomeString"; // is optimized
return strlen(s);
 
const char * s = "SomeString"; // is optimized
return strlen(s);
 
static const char * s = "SomeString"; // is not optimized
return strlen(s);

You can register if you want to edit your comments and post own articles.

Ace, it's wired to strlen. MSVC++ and GCC cannot recognize functions without side effects. BTW, I'm writing an article on a related topic now :)

ace,
don't be afraid of using strlen, there is no need to optimize it by hand using sizeof.

One more question from somebody who doesn't have latest MSVC at hand, can strlen be a part of the constant expressions which are then used for the things that have to be known at the compile time, like the size of the array on the stack? On my MSVC 6, I can do

char s[] = "something";
char a[ 2 * sizeof( s ) ];
Peter Kankowski,

No, using functions in constant expressions would violate C/C++ standard, but it would be a very useful feature.

ninjalj,
static const char * s = "SomeString"; // is not optimized

That's because the s pointer can be modified by other code. Just add a const qualifier to the pointer, and GCC should optimize it:

static const char *const s = "SomeString"; // is optimized

And about GCC not recognizing functions without side effects, you can use GCC's function attributes to explicitly mark them __attribute__ ((pure)) for functions without side effects, and __attribute__ ((const)) for functions without side effects that don't depend on global memory (strlen() is pure, but not const, since it has no side effects but has to access the string pointed to by its argument).

Peter Kankowski,

Ninjalj, thank you very much for the information.

Your name:


Comment:


Please ignore this field: