#line 1 "numpy/core/src/npysort/radixsort.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "npy_sort.h"
#include "npysort_common.h"
#include <stdlib.h>

/*
 *****************************************************************************
 **                            INTEGER SORTS                                **
 *****************************************************************************
 */


#line 25

// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// KEY_OF(x) maps a stored element to the unsigned key the radix passes work on.
// The two constant conditions below pick exactly one definition: the outer one
// chooses between the sign-adjusting keys and the identity key, the inner one
// between the float key and the signed-int key.
// Here the outer condition is 0, so the identity key in the final #else branch
// is the definition in effect.
#if 0
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ubyte) * 8 - 1)) | ((npy_ubyte)1 << (sizeof(npy_ubyte) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ubyte)1 << (sizeof(npy_ubyte) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 selects the least
 * significant byte.
 */
static inline npy_ubyte
nth_byte_bool(npy_ubyte key, npy_intp l) {
    const npy_intp shift = l << 3;  /* eight bits per byte */

    return (key >> shift) & 0xFF;
}

/*
 * Core of the byte-wise radix sort (`bool` variant).
 *
 * Builds one 256-bin histogram per byte of the key, skips every byte
 * position on which all keys agree, and runs one stable counting-sort pass
 * per remaining position, least significant byte first, moving the data
 * back and forth between `arr` and `aux`.
 *
 * Returns whichever of the two buffers holds the result after the last
 * pass, which may be `aux`. `arr[0]` is read unconditionally.
 */
static npy_ubyte*
radixsort0_bool(npy_ubyte *arr, npy_ubyte *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ubyte)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ubyte)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ubyte first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ubyte key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
            hist[pos][nth_byte_bool(key, pos)] += 1;
        }
    }

    /*
     * A byte position where a single bin holds all `num` keys cannot
     * change the order, so only the other positions get a pass.
     */
    for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
        if (hist[pos][nth_byte_bool(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter per active position, then swap the two buffers. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_ubyte *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_ubyte key = KEY_OF(arr[idx]);
            const npy_intp dst = offsets[nth_byte_bool(key, col)]++;

            aux[dst] = arr[idx];
        }

        swap = aux;
        aux = arr;
        arr = swap;
    }

    return arr;
}

/*
 * Sort the `num` elements at `start` in place (`bool` variant).
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two elements, or keys already in non-decreasing order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
radixsort_bool(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ubyte *data = start;
    npy_ubyte *scratch;
    npy_ubyte *result;
    npy_ubyte prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[0]);
    for (pos = 1; pos < num; pos++) {
        const npy_ubyte cur = KEY_OF(data[pos]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the input is already sorted. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ubyte));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = radixsort0_bool(data, scratch, num);
    if (result != data) {
        memcpy(data, result, num * sizeof(npy_ubyte));
    }

    free(scratch);
    return 0;
}

/*
 * Core of the indirect (argsort) radix sort (`bool` variant).
 *
 * Reorders the indices in `tosort` by the keys of the elements of `arr`
 * they refer to; `arr` itself is never written. The passes are stable and
 * go from the least significant byte upwards, skipping byte positions on
 * which all keys agree.
 *
 * The histograms are taken over arr[0] .. arr[num - 1] directly, while the
 * scatter visits arr[tosort[i]]; the offsets therefore only match when
 * `tosort` holds every index in [0, num) exactly once.
 *
 * Returns whichever of `tosort` and `aux` holds the result after the last
 * pass.
 */
static npy_intp*
aradixsort0_bool(npy_ubyte *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ubyte)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ubyte)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ubyte first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ubyte key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
            hist[pos][nth_byte_bool(key, pos)] += 1;
        }
    }

    /* Only byte positions that differ between keys need a pass. */
    for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
        if (hist[pos][nth_byte_bool(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter of the indices per active position. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_intp *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_intp src = tosort[idx];
            const npy_ubyte key = KEY_OF(arr[src]);
            const npy_intp dst = offsets[nth_byte_bool(key, col)]++;

            aux[dst] = src;
        }

        swap = aux;
        aux = tosort;
        tosort = swap;
    }

    return tosort;
}

/*
 * Indirect sort (`bool` variant): reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in non-decreasing key
 * order. The data at `start` is not modified.
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two indices, or indices already in key order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
aradixsort_bool(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ubyte *data = start;
    npy_intp *scratch;
    npy_intp *result;
    npy_ubyte prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        const npy_ubyte cur = KEY_OF(data[tosort[pos]]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the indices are already in order. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = aradixsort0_bool(data, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

// Retire this section's key mapping; the next section defines its own KEY_OF.
#undef KEY_OF


#line 25

// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// KEY_OF(x) maps a stored element to the unsigned key the radix passes work on.
// The two constant conditions below pick exactly one definition: the outer one
// chooses between the sign-adjusting keys and the identity key, the inner one
// between the float key and the signed-int key.
// Here the outer condition is 1 and the inner one is 0, so the signed-int key
// (sign bit flipped) is the definition in effect.
#if 1
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ubyte) * 8 - 1)) | ((npy_ubyte)1 << (sizeof(npy_ubyte) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ubyte)1 << (sizeof(npy_ubyte) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 selects the least
 * significant byte.
 */
static inline npy_ubyte
nth_byte_byte(npy_ubyte key, npy_intp l) {
    const npy_intp shift = l << 3;  /* eight bits per byte */

    return (key >> shift) & 0xFF;
}

/*
 * Core of the byte-wise radix sort (`byte` variant).
 *
 * Builds one 256-bin histogram per byte of the key, skips every byte
 * position on which all keys agree, and runs one stable counting-sort pass
 * per remaining position, least significant byte first, moving the data
 * back and forth between `arr` and `aux`.
 *
 * Returns whichever of the two buffers holds the result after the last
 * pass, which may be `aux`. `arr[0]` is read unconditionally.
 */
static npy_ubyte*
radixsort0_byte(npy_ubyte *arr, npy_ubyte *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ubyte)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ubyte)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ubyte first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ubyte key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
            hist[pos][nth_byte_byte(key, pos)] += 1;
        }
    }

    /*
     * A byte position where a single bin holds all `num` keys cannot
     * change the order, so only the other positions get a pass.
     */
    for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
        if (hist[pos][nth_byte_byte(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter per active position, then swap the two buffers. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_ubyte *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_ubyte key = KEY_OF(arr[idx]);
            const npy_intp dst = offsets[nth_byte_byte(key, col)]++;

            aux[dst] = arr[idx];
        }

        swap = aux;
        aux = arr;
        arr = swap;
    }

    return arr;
}

/*
 * Sort the `num` elements at `start` in place (`byte` variant).
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two elements, or keys already in non-decreasing order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
radixsort_byte(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ubyte *data = start;
    npy_ubyte *scratch;
    npy_ubyte *result;
    npy_ubyte prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[0]);
    for (pos = 1; pos < num; pos++) {
        const npy_ubyte cur = KEY_OF(data[pos]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the input is already sorted. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ubyte));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = radixsort0_byte(data, scratch, num);
    if (result != data) {
        memcpy(data, result, num * sizeof(npy_ubyte));
    }

    free(scratch);
    return 0;
}

/*
 * Core of the indirect (argsort) radix sort (`byte` variant).
 *
 * Reorders the indices in `tosort` by the keys of the elements of `arr`
 * they refer to; `arr` itself is never written. The passes are stable and
 * go from the least significant byte upwards, skipping byte positions on
 * which all keys agree.
 *
 * The histograms are taken over arr[0] .. arr[num - 1] directly, while the
 * scatter visits arr[tosort[i]]; the offsets therefore only match when
 * `tosort` holds every index in [0, num) exactly once.
 *
 * Returns whichever of `tosort` and `aux` holds the result after the last
 * pass.
 */
static npy_intp*
aradixsort0_byte(npy_ubyte *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ubyte)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ubyte)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ubyte first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ubyte key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
            hist[pos][nth_byte_byte(key, pos)] += 1;
        }
    }

    /* Only byte positions that differ between keys need a pass. */
    for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
        if (hist[pos][nth_byte_byte(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter of the indices per active position. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_intp *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_intp src = tosort[idx];
            const npy_ubyte key = KEY_OF(arr[src]);
            const npy_intp dst = offsets[nth_byte_byte(key, col)]++;

            aux[dst] = src;
        }

        swap = aux;
        aux = tosort;
        tosort = swap;
    }

    return tosort;
}

/*
 * Indirect sort (`byte` variant): reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in non-decreasing key
 * order. The data at `start` is not modified.
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two indices, or indices already in key order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
aradixsort_byte(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ubyte *data = start;
    npy_intp *scratch;
    npy_intp *result;
    npy_ubyte prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        const npy_ubyte cur = KEY_OF(data[tosort[pos]]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the indices are already in order. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = aradixsort0_byte(data, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

// Retire this section's key mapping; the next section defines its own KEY_OF.
#undef KEY_OF


#line 25

// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// KEY_OF(x) maps a stored element to the unsigned key the radix passes work on.
// The two constant conditions below pick exactly one definition: the outer one
// chooses between the sign-adjusting keys and the identity key, the inner one
// between the float key and the signed-int key.
// Here the outer condition is 0, so the identity key in the final #else branch
// is the definition in effect.
#if 0
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ubyte) * 8 - 1)) | ((npy_ubyte)1 << (sizeof(npy_ubyte) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ubyte)1 << (sizeof(npy_ubyte) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 selects the least
 * significant byte.
 */
static inline npy_ubyte
nth_byte_ubyte(npy_ubyte key, npy_intp l) {
    const npy_intp shift = l << 3;  /* eight bits per byte */

    return (key >> shift) & 0xFF;
}

/*
 * Core of the byte-wise radix sort (`ubyte` variant).
 *
 * Builds one 256-bin histogram per byte of the key, skips every byte
 * position on which all keys agree, and runs one stable counting-sort pass
 * per remaining position, least significant byte first, moving the data
 * back and forth between `arr` and `aux`.
 *
 * Returns whichever of the two buffers holds the result after the last
 * pass, which may be `aux`. `arr[0]` is read unconditionally.
 */
static npy_ubyte*
radixsort0_ubyte(npy_ubyte *arr, npy_ubyte *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ubyte)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ubyte)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ubyte first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ubyte key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
            hist[pos][nth_byte_ubyte(key, pos)] += 1;
        }
    }

    /*
     * A byte position where a single bin holds all `num` keys cannot
     * change the order, so only the other positions get a pass.
     */
    for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
        if (hist[pos][nth_byte_ubyte(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter per active position, then swap the two buffers. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_ubyte *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_ubyte key = KEY_OF(arr[idx]);
            const npy_intp dst = offsets[nth_byte_ubyte(key, col)]++;

            aux[dst] = arr[idx];
        }

        swap = aux;
        aux = arr;
        arr = swap;
    }

    return arr;
}

/*
 * Sort the `num` elements at `start` in place (`ubyte` variant).
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two elements, or keys already in non-decreasing order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
radixsort_ubyte(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ubyte *data = start;
    npy_ubyte *scratch;
    npy_ubyte *result;
    npy_ubyte prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[0]);
    for (pos = 1; pos < num; pos++) {
        const npy_ubyte cur = KEY_OF(data[pos]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the input is already sorted. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ubyte));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = radixsort0_ubyte(data, scratch, num);
    if (result != data) {
        memcpy(data, result, num * sizeof(npy_ubyte));
    }

    free(scratch);
    return 0;
}

/*
 * Core of the indirect (argsort) radix sort (`ubyte` variant).
 *
 * Reorders the indices in `tosort` by the keys of the elements of `arr`
 * they refer to; `arr` itself is never written. The passes are stable and
 * go from the least significant byte upwards, skipping byte positions on
 * which all keys agree.
 *
 * The histograms are taken over arr[0] .. arr[num - 1] directly, while the
 * scatter visits arr[tosort[i]]; the offsets therefore only match when
 * `tosort` holds every index in [0, num) exactly once.
 *
 * Returns whichever of `tosort` and `aux` holds the result after the last
 * pass.
 */
static npy_intp*
aradixsort0_ubyte(npy_ubyte *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ubyte)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ubyte)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ubyte first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ubyte key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
            hist[pos][nth_byte_ubyte(key, pos)] += 1;
        }
    }

    /* Only byte positions that differ between keys need a pass. */
    for (pos = 0; pos < sizeof(npy_ubyte); pos++) {
        if (hist[pos][nth_byte_ubyte(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter of the indices per active position. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_intp *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_intp src = tosort[idx];
            const npy_ubyte key = KEY_OF(arr[src]);
            const npy_intp dst = offsets[nth_byte_ubyte(key, col)]++;

            aux[dst] = src;
        }

        swap = aux;
        aux = tosort;
        tosort = swap;
    }

    return tosort;
}

/*
 * Indirect sort (`ubyte` variant): reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in non-decreasing key
 * order. The data at `start` is not modified.
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two indices, or indices already in key order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
aradixsort_ubyte(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ubyte *data = start;
    npy_intp *scratch;
    npy_intp *result;
    npy_ubyte prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        const npy_ubyte cur = KEY_OF(data[tosort[pos]]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the indices are already in order. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = aradixsort0_ubyte(data, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

// Retire this section's key mapping; the next section defines its own KEY_OF.
#undef KEY_OF


#line 25

// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// KEY_OF(x) maps a stored element to the unsigned key the radix passes work on.
// The two constant conditions below pick exactly one definition: the outer one
// chooses between the sign-adjusting keys and the identity key, the inner one
// between the float key and the signed-int key.
// Here the outer condition is 1 and the inner one is 0, so the signed-int key
// (sign bit flipped) is the definition in effect.
#if 1
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ushort) * 8 - 1)) | ((npy_ushort)1 << (sizeof(npy_ushort) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ushort)1 << (sizeof(npy_ushort) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 selects the least
 * significant byte.
 */
static inline npy_ubyte
nth_byte_short(npy_ushort key, npy_intp l) {
    const npy_intp shift = l << 3;  /* eight bits per byte */

    return (key >> shift) & 0xFF;
}

/*
 * Core of the byte-wise radix sort (`short` variant).
 *
 * Builds one 256-bin histogram per byte of the key, skips every byte
 * position on which all keys agree, and runs one stable counting-sort pass
 * per remaining position, least significant byte first, moving the data
 * back and forth between `arr` and `aux`.
 *
 * Returns whichever of the two buffers holds the result after the last
 * pass, which may be `aux`. `arr[0]` is read unconditionally.
 */
static npy_ushort*
radixsort0_short(npy_ushort *arr, npy_ushort *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ushort)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ushort)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ushort first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ushort key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ushort); pos++) {
            hist[pos][nth_byte_short(key, pos)] += 1;
        }
    }

    /*
     * A byte position where a single bin holds all `num` keys cannot
     * change the order, so only the other positions get a pass.
     */
    for (pos = 0; pos < sizeof(npy_ushort); pos++) {
        if (hist[pos][nth_byte_short(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter per active position, then swap the two buffers. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_ushort *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_ushort key = KEY_OF(arr[idx]);
            const npy_intp dst = offsets[nth_byte_short(key, col)]++;

            aux[dst] = arr[idx];
        }

        swap = aux;
        aux = arr;
        arr = swap;
    }

    return arr;
}

/*
 * Sort the `num` elements at `start` in place (`short` variant).
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two elements, or keys already in non-decreasing order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
radixsort_short(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ushort *data = start;
    npy_ushort *scratch;
    npy_ushort *result;
    npy_ushort prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[0]);
    for (pos = 1; pos < num; pos++) {
        const npy_ushort cur = KEY_OF(data[pos]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the input is already sorted. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ushort));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = radixsort0_short(data, scratch, num);
    if (result != data) {
        memcpy(data, result, num * sizeof(npy_ushort));
    }

    free(scratch);
    return 0;
}

/*
 * Core of the indirect (argsort) radix sort (`short` variant).
 *
 * Reorders the indices in `tosort` by the keys of the elements of `arr`
 * they refer to; `arr` itself is never written. The passes are stable and
 * go from the least significant byte upwards, skipping byte positions on
 * which all keys agree.
 *
 * The histograms are taken over arr[0] .. arr[num - 1] directly, while the
 * scatter visits arr[tosort[i]]; the offsets therefore only match when
 * `tosort` holds every index in [0, num) exactly once.
 *
 * Returns whichever of `tosort` and `aux` holds the result after the last
 * pass.
 */
static npy_intp*
aradixsort0_short(npy_ushort *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ushort)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ushort)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ushort first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ushort key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ushort); pos++) {
            hist[pos][nth_byte_short(key, pos)] += 1;
        }
    }

    /* Only byte positions that differ between keys need a pass. */
    for (pos = 0; pos < sizeof(npy_ushort); pos++) {
        if (hist[pos][nth_byte_short(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter of the indices per active position. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_intp *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_intp src = tosort[idx];
            const npy_ushort key = KEY_OF(arr[src]);
            const npy_intp dst = offsets[nth_byte_short(key, col)]++;

            aux[dst] = src;
        }

        swap = aux;
        aux = tosort;
        tosort = swap;
    }

    return tosort;
}

/*
 * Indirect sort (`short` variant): reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in non-decreasing key
 * order. The data at `start` is not modified.
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two indices, or indices already in key order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
aradixsort_short(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ushort *data = start;
    npy_intp *scratch;
    npy_intp *result;
    npy_ushort prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        const npy_ushort cur = KEY_OF(data[tosort[pos]]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the indices are already in order. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = aradixsort0_short(data, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

// Retire this section's key mapping; the next section defines its own KEY_OF.
#undef KEY_OF


#line 25

// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// KEY_OF(x) maps a stored element to the unsigned key the radix passes work on.
// The two constant conditions below pick exactly one definition: the outer one
// chooses between the sign-adjusting keys and the identity key, the inner one
// between the float key and the signed-int key.
// Here the outer condition is 0, so the identity key in the final #else branch
// is the definition in effect.
#if 0
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ushort) * 8 - 1)) | ((npy_ushort)1 << (sizeof(npy_ushort) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ushort)1 << (sizeof(npy_ushort) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 selects the least
 * significant byte.
 */
static inline npy_ubyte
nth_byte_ushort(npy_ushort key, npy_intp l) {
    const npy_intp shift = l << 3;  /* eight bits per byte */

    return (key >> shift) & 0xFF;
}

/*
 * Core of the byte-wise radix sort (`ushort` variant).
 *
 * Builds one 256-bin histogram per byte of the key, skips every byte
 * position on which all keys agree, and runs one stable counting-sort pass
 * per remaining position, least significant byte first, moving the data
 * back and forth between `arr` and `aux`.
 *
 * Returns whichever of the two buffers holds the result after the last
 * pass, which may be `aux`. `arr[0]` is read unconditionally.
 */
static npy_ushort*
radixsort0_ushort(npy_ushort *arr, npy_ushort *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ushort)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ushort)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ushort first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ushort key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ushort); pos++) {
            hist[pos][nth_byte_ushort(key, pos)] += 1;
        }
    }

    /*
     * A byte position where a single bin holds all `num` keys cannot
     * change the order, so only the other positions get a pass.
     */
    for (pos = 0; pos < sizeof(npy_ushort); pos++) {
        if (hist[pos][nth_byte_ushort(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter per active position, then swap the two buffers. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_ushort *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_ushort key = KEY_OF(arr[idx]);
            const npy_intp dst = offsets[nth_byte_ushort(key, col)]++;

            aux[dst] = arr[idx];
        }

        swap = aux;
        aux = arr;
        arr = swap;
    }

    return arr;
}

/*
 * Sort the `num` elements at `start` in place (`ushort` variant).
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two elements, or keys already in non-decreasing order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
radixsort_ushort(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ushort *data = start;
    npy_ushort *scratch;
    npy_ushort *result;
    npy_ushort prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[0]);
    for (pos = 1; pos < num; pos++) {
        const npy_ushort cur = KEY_OF(data[pos]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the input is already sorted. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ushort));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = radixsort0_ushort(data, scratch, num);
    if (result != data) {
        memcpy(data, result, num * sizeof(npy_ushort));
    }

    free(scratch);
    return 0;
}

/*
 * Core of the indirect (argsort) radix sort (`ushort` variant).
 *
 * Reorders the indices in `tosort` by the keys of the elements of `arr`
 * they refer to; `arr` itself is never written. The passes are stable and
 * go from the least significant byte upwards, skipping byte positions on
 * which all keys agree.
 *
 * The histograms are taken over arr[0] .. arr[num - 1] directly, while the
 * scatter visits arr[tosort[i]]; the offsets therefore only match when
 * `tosort` holds every index in [0, num) exactly once.
 *
 * Returns whichever of `tosort` and `aux` holds the result after the last
 * pass.
 */
static npy_intp*
aradixsort0_ushort(npy_ushort *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ushort)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_ushort)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_ushort first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_ushort key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_ushort); pos++) {
            hist[pos][nth_byte_ushort(key, pos)] += 1;
        }
    }

    /* Only byte positions that differ between keys need a pass. */
    for (pos = 0; pos < sizeof(npy_ushort); pos++) {
        if (hist[pos][nth_byte_ushort(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter of the indices per active position. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_intp *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_intp src = tosort[idx];
            const npy_ushort key = KEY_OF(arr[src]);
            const npy_intp dst = offsets[nth_byte_ushort(key, col)]++;

            aux[dst] = src;
        }

        swap = aux;
        aux = tosort;
        tosort = swap;
    }

    return tosort;
}

/*
 * Indirect sort (`ushort` variant): reorder the `num` indices in `tosort`
 * so that the elements of `start` they refer to are in non-decreasing key
 * order. The data at `start` is not modified.
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two indices, or indices already in key order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
aradixsort_ushort(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ushort *data = start;
    npy_intp *scratch;
    npy_intp *result;
    npy_ushort prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        const npy_ushort cur = KEY_OF(data[tosort[pos]]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the indices are already in order. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = aradixsort0_ushort(data, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

// Retire this section's key mapping; the next section defines its own KEY_OF.
#undef KEY_OF


#line 25

// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// KEY_OF(x) maps a stored element to the unsigned key the radix passes work on.
// The two constant conditions below pick exactly one definition: the outer one
// chooses between the sign-adjusting keys and the identity key, the inner one
// between the float key and the signed-int key.
// Here the outer condition is 1 and the inner one is 0, so the signed-int key
// (sign bit flipped) is the definition in effect.
#if 1
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_uint) * 8 - 1)) | ((npy_uint)1 << (sizeof(npy_uint) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_uint)1 << (sizeof(npy_uint) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 selects the least
 * significant byte.
 */
static inline npy_ubyte
nth_byte_int(npy_uint key, npy_intp l) {
    const npy_intp shift = l << 3;  /* eight bits per byte */

    return (key >> shift) & 0xFF;
}

/*
 * Core of the byte-wise radix sort (`int` variant).
 *
 * Builds one 256-bin histogram per byte of the key, skips every byte
 * position on which all keys agree, and runs one stable counting-sort pass
 * per remaining position, least significant byte first, moving the data
 * back and forth between `arr` and `aux`.
 *
 * Returns whichever of the two buffers holds the result after the last
 * pass, which may be `aux`. `arr[0]` is read unconditionally.
 */
static npy_uint*
radixsort0_int(npy_uint *arr, npy_uint *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_uint)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_uint)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_uint first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_uint key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_uint); pos++) {
            hist[pos][nth_byte_int(key, pos)] += 1;
        }
    }

    /*
     * A byte position where a single bin holds all `num` keys cannot
     * change the order, so only the other positions get a pass.
     */
    for (pos = 0; pos < sizeof(npy_uint); pos++) {
        if (hist[pos][nth_byte_int(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter per active position, then swap the two buffers. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_uint *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_uint key = KEY_OF(arr[idx]);
            const npy_intp dst = offsets[nth_byte_int(key, col)]++;

            aux[dst] = arr[idx];
        }

        swap = aux;
        aux = arr;
        arr = swap;
    }

    return arr;
}

/*
 * Sort the `num` elements at `start` in place (`int` variant).
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two elements, or keys already in non-decreasing order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
radixsort_int(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_uint *data = start;
    npy_uint *scratch;
    npy_uint *result;
    npy_uint prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[0]);
    for (pos = 1; pos < num; pos++) {
        const npy_uint cur = KEY_OF(data[pos]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the input is already sorted. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_uint));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = radixsort0_int(data, scratch, num);
    if (result != data) {
        memcpy(data, result, num * sizeof(npy_uint));
    }

    free(scratch);
    return 0;
}

/*
 * Core of the indirect (argsort) radix sort (`int` variant).
 *
 * Reorders the indices in `tosort` by the keys of the elements of `arr`
 * they refer to; `arr` itself is never written. The passes are stable and
 * go from the least significant byte upwards, skipping byte positions on
 * which all keys agree.
 *
 * The histograms are taken over arr[0] .. arr[num - 1] directly, while the
 * scatter visits arr[tosort[i]]; the offsets therefore only match when
 * `tosort` holds every index in [0, num) exactly once.
 *
 * Returns whichever of `tosort` and `aux` holds the result after the last
 * pass.
 */
static npy_intp*
aradixsort0_int(npy_uint *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_uint)][1 << 8] = { { 0 } };
    npy_ubyte active[sizeof(npy_uint)];
    size_t nactive = 0;
    size_t pos, pass;
    npy_intp idx, bin;
    const npy_uint first_key = KEY_OF(arr[0]);

    /* Histogram every byte position of every key in a single sweep. */
    for (idx = 0; idx < num; idx++) {
        const npy_uint key = KEY_OF(arr[idx]);

        for (pos = 0; pos < sizeof(npy_uint); pos++) {
            hist[pos][nth_byte_int(key, pos)] += 1;
        }
    }

    /* Only byte positions that differ between keys need a pass. */
    for (pos = 0; pos < sizeof(npy_uint); pos++) {
        if (hist[pos][nth_byte_int(first_key, pos)] == num) {
            continue;
        }
        active[nactive] = pos;
        nactive += 1;
    }

    /* Replace the counts of each active position by exclusive prefix sums. */
    for (pass = 0; pass < nactive; pass++) {
        npy_intp *offsets = hist[active[pass]];
        npy_intp running = 0;

        for (bin = 0; bin < 256; bin++) {
            const npy_intp count = offsets[bin];

            offsets[bin] = running;
            running += count;
        }
    }

    /* One stable scatter of the indices per active position. */
    for (pass = 0; pass < nactive; pass++) {
        const npy_ubyte col = active[pass];
        npy_intp *offsets = hist[col];
        npy_intp *swap;

        for (idx = 0; idx < num; idx++) {
            const npy_intp src = tosort[idx];
            const npy_uint key = KEY_OF(arr[src]);
            const npy_intp dst = offsets[nth_byte_int(key, col)]++;

            aux[dst] = src;
        }

        swap = aux;
        aux = tosort;
        tosort = swap;
    }

    return tosort;
}

/*
 * Indirect sort (`int` variant): reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in non-decreasing key
 * order. The data at `start` is not modified.
 *
 * Returns 0 on success, including the cases where nothing has to be done
 * (fewer than two indices, or indices already in key order), and
 * -NPY_ENOMEM when the scratch buffer cannot be allocated. The last
 * argument is not used.
 */
NPY_NO_EXPORT int
aradixsort_int(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_uint *data = start;
    npy_intp *scratch;
    npy_intp *result;
    npy_uint prev;
    npy_intp pos;

    if (num < 2) {
        return 0;
    }

    /* Stop at the first adjacent pair that is out of order, if any. */
    prev = KEY_OF(data[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        const npy_uint cur = KEY_OF(data[tosort[pos]]);

        if (prev > cur) {
            break;
        }
        prev = cur;
    }

    /* The scan ran to the end: the indices are already in order. */
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The result may end up in the scratch buffer; copy it back if so. */
    result = aradixsort0_int(data, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

// Retire this section's key mapping; the next section defines its own KEY_OF.
#undef KEY_OF


#line 25

// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// KEY_OF(x) maps a stored element to the unsigned key the radix passes work on.
// The two constant conditions below pick exactly one definition: the outer one
// chooses between the sign-adjusting keys and the identity key, the inner one
// between the float key and the signed-int key.
// Here the outer condition is 0, so the identity key in the final #else branch
// is the definition in effect.
#if 0
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_uint) * 8 - 1)) | ((npy_uint)1 << (sizeof(npy_uint) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_uint)1 << (sizeof(npy_uint) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 selects the least
 * significant byte.
 */
static inline npy_ubyte
nth_byte_uint(npy_uint key, npy_intp l) {
    const npy_intp shift = l << 3;  /* eight bits per byte */

    return (key >> shift) & 0xFF;
}

/*
 * Byte-wise least-significant-digit radix sort of `num` npy_uint elements.
 *
 * Elements are ordered by KEY_OF(element), one key byte per pass, starting
 * with the lowest byte.  Each pass scatters from the current source buffer
 * into the other one and the two buffers then trade roles, so both `arr`
 * and `aux` must have room for `num` elements.  Byte positions at which
 * every key carries the same value are skipped, because such a pass could
 * not change the order.
 *
 * arr[0] is read unconditionally, so `num` must be at least 1.
 *
 * Returns the buffer (either `arr` or `aux`) that holds the sorted elements
 * after the last executed pass.
 */
static npy_uint*
radixsort0_uint(npy_uint *arr, npy_uint *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_uint)][1 << 8] = { { 0 } };
    const npy_uint first_key = KEY_OF(arr[0]);
    npy_uint *src = arr;
    npy_uint *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_uint key = KEY_OF(src[n]);

        for (digit = 0; digit < sizeof(npy_uint); digit++) {
            hist[digit][nth_byte_uint(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_uint); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_uint *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_uint(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_uint key = KEY_OF(src[n]);
            const npy_intp slot = bucket[nth_byte_uint(key, digit)]++;

            dst[slot] = src[n];
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Sort the `num` elements at `start` in place, in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_uint.  Inputs with fewer than two
 * elements, or whose keys are already non-decreasing, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` elements is
 * allocated for radixsort0_uint, and the result is copied back into `start`
 * when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
radixsort_uint(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_uint *values = start;
    npy_uint prev_key;
    npy_intp pos;
    npy_uint *scratch;
    void *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[0]);
    for (pos = 1; pos < num; pos++) {
        npy_uint cur_key = KEY_OF(values[pos]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_uint));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted data may end up in either buffer. */
    result = radixsort0_uint(start, scratch, num);
    if (result != start) {
        memcpy(start, result, num * sizeof(npy_uint));
    }

    free(scratch);
    return 0;
}

/*
 * Byte-wise least-significant-digit radix sort of the index array `tosort`
 * by the keys of the elements it refers to, i.e. by KEY_OF(arr[tosort[i]]).
 * `arr` itself is never written.
 *
 * The per-byte histograms are built by scanning arr[0..num-1] directly, not
 * through `tosort`.  That matches the keys seen by the scatter passes only
 * when `tosort` holds each index 0..num-1 exactly once -- presumably
 * guaranteed by the callers; confirm before using this with any other index
 * set.
 *
 * Each pass scatters indices from the current source buffer into the other
 * one and the two buffers then trade roles, so both `tosort` and `aux` must
 * have room for `num` indices.  Byte positions at which every key carries
 * the same value are skipped.  arr[0] is read unconditionally, so `num` must
 * be at least 1.
 *
 * Returns the buffer (either `tosort` or `aux`) that holds the sorted
 * indices after the last executed pass.
 */
static npy_intp*
aradixsort0_uint(npy_uint *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_uint)][1 << 8] = { { 0 } };
    const npy_uint first_key = KEY_OF(arr[0]);
    npy_intp *src = tosort;
    npy_intp *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single sequential scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_uint key = KEY_OF(arr[n]);

        for (digit = 0; digit < sizeof(npy_uint); digit++) {
            hist[digit][nth_byte_uint(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_uint); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_intp *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_uint(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter the indices in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_intp idx = src[n];
            const npy_uint key = KEY_OF(arr[idx]);
            const npy_intp slot = bucket[nth_byte_uint(key, digit)]++;

            dst[slot] = idx;
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Indirect (argsort) radix sort: reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_uint and is not modified by this
 * function.  Inputs with fewer than two indices, or whose keys already form
 * a non-decreasing sequence when read through `tosort`, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` indices is
 * allocated for aradixsort0_uint, and the result is copied back into
 * `tosort` when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
aradixsort_uint(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_uint *values = start;
    npy_uint prev_key;
    npy_intp pos;
    npy_intp *scratch;
    npy_intp *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        npy_uint cur_key = KEY_OF(values[tosort[pos]]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted indices may end up in either buffer. */
    result = aradixsort0_uint(start, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

#undef KEY_OF


#line 25

// KEY_OF(x) maps a stored element to the unsigned key whose bytes the radix
// passes below bucket on.
// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// In this instantiation (suffix `long`, elements handled as npy_ulong) the
// outer switch is 1 and the inner switch is 0, so the sign-bit flip in the
// inner #else branch is the definition in effect.
#if 1
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ulong) * 8 - 1)) | ((npy_ulong)1 << (sizeof(npy_ulong) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ulong)1 << (sizeof(npy_ulong) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 is the least-significant
 * byte: the key is shifted right by 8 * l bits and the low 8 bits are kept.
 */
static inline npy_ubyte
nth_byte_long(npy_ulong key, npy_intp l) {
    const npy_intp shift = l * 8;

    return (npy_ubyte)((key >> shift) & 0xFF);
}

/*
 * Byte-wise least-significant-digit radix sort of `num` elements handled as
 * npy_ulong bit patterns.
 *
 * Elements are ordered by KEY_OF(element), one key byte per pass, starting
 * with the lowest byte; the elements themselves are moved unchanged.  Each
 * pass scatters from the current source buffer into the other one and the
 * two buffers then trade roles, so both `arr` and `aux` must have room for
 * `num` elements.  Byte positions at which every key carries the same value
 * are skipped, because such a pass could not change the order.
 *
 * arr[0] is read unconditionally, so `num` must be at least 1.
 *
 * Returns the buffer (either `arr` or `aux`) that holds the sorted elements
 * after the last executed pass.
 */
static npy_ulong*
radixsort0_long(npy_ulong *arr, npy_ulong *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulong)][1 << 8] = { { 0 } };
    const npy_ulong first_key = KEY_OF(arr[0]);
    npy_ulong *src = arr;
    npy_ulong *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulong key = KEY_OF(src[n]);

        for (digit = 0; digit < sizeof(npy_ulong); digit++) {
            hist[digit][nth_byte_long(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_ulong *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_long(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_ulong key = KEY_OF(src[n]);
            const npy_intp slot = bucket[nth_byte_long(key, digit)]++;

            dst[slot] = src[n];
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Sort the `num` elements at `start` in place, in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulong.  Inputs with fewer than two
 * elements, or whose keys are already non-decreasing, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` elements is
 * allocated for radixsort0_long, and the result is copied back into `start`
 * when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
radixsort_long(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulong *values = start;
    npy_ulong prev_key;
    npy_intp pos;
    npy_ulong *scratch;
    void *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[0]);
    for (pos = 1; pos < num; pos++) {
        npy_ulong cur_key = KEY_OF(values[pos]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ulong));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted data may end up in either buffer. */
    result = radixsort0_long(start, scratch, num);
    if (result != start) {
        memcpy(start, result, num * sizeof(npy_ulong));
    }

    free(scratch);
    return 0;
}

/*
 * Byte-wise least-significant-digit radix sort of the index array `tosort`
 * by the keys of the elements it refers to, i.e. by KEY_OF(arr[tosort[i]]).
 * `arr` itself is never written.
 *
 * The per-byte histograms are built by scanning arr[0..num-1] directly, not
 * through `tosort`.  That matches the keys seen by the scatter passes only
 * when `tosort` holds each index 0..num-1 exactly once -- presumably
 * guaranteed by the callers; confirm before using this with any other index
 * set.
 *
 * Each pass scatters indices from the current source buffer into the other
 * one and the two buffers then trade roles, so both `tosort` and `aux` must
 * have room for `num` indices.  Byte positions at which every key carries
 * the same value are skipped.  arr[0] is read unconditionally, so `num` must
 * be at least 1.
 *
 * Returns the buffer (either `tosort` or `aux`) that holds the sorted
 * indices after the last executed pass.
 */
static npy_intp*
aradixsort0_long(npy_ulong *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulong)][1 << 8] = { { 0 } };
    const npy_ulong first_key = KEY_OF(arr[0]);
    npy_intp *src = tosort;
    npy_intp *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single sequential scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulong key = KEY_OF(arr[n]);

        for (digit = 0; digit < sizeof(npy_ulong); digit++) {
            hist[digit][nth_byte_long(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_intp *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_long(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter the indices in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_intp idx = src[n];
            const npy_ulong key = KEY_OF(arr[idx]);
            const npy_intp slot = bucket[nth_byte_long(key, digit)]++;

            dst[slot] = idx;
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Indirect (argsort) radix sort: reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulong and is not modified by this
 * function.  Inputs with fewer than two indices, or whose keys already form
 * a non-decreasing sequence when read through `tosort`, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` indices is
 * allocated for aradixsort0_long, and the result is copied back into
 * `tosort` when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
aradixsort_long(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulong *values = start;
    npy_ulong prev_key;
    npy_intp pos;
    npy_intp *scratch;
    npy_intp *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        npy_ulong cur_key = KEY_OF(values[tosort[pos]]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted indices may end up in either buffer. */
    result = aradixsort0_long(start, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

#undef KEY_OF


#line 25

// KEY_OF(x) maps a stored element to the unsigned key whose bytes the radix
// passes below bucket on.
// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// In this instantiation (npy_ulong elements) the outer switch is 0, so the
// identity mapping in the final #else branch is the definition in effect and
// both definitions inside the outer #if are compiled out.
#if 0
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ulong) * 8 - 1)) | ((npy_ulong)1 << (sizeof(npy_ulong) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ulong)1 << (sizeof(npy_ulong) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 is the least-significant
 * byte: the key is shifted right by 8 * l bits and the low 8 bits are kept.
 */
static inline npy_ubyte
nth_byte_ulong(npy_ulong key, npy_intp l) {
    const npy_intp shift = l * 8;

    return (npy_ubyte)((key >> shift) & 0xFF);
}

/*
 * Byte-wise least-significant-digit radix sort of `num` npy_ulong elements.
 *
 * Elements are ordered by KEY_OF(element), one key byte per pass, starting
 * with the lowest byte.  Each pass scatters from the current source buffer
 * into the other one and the two buffers then trade roles, so both `arr`
 * and `aux` must have room for `num` elements.  Byte positions at which
 * every key carries the same value are skipped, because such a pass could
 * not change the order.
 *
 * arr[0] is read unconditionally, so `num` must be at least 1.
 *
 * Returns the buffer (either `arr` or `aux`) that holds the sorted elements
 * after the last executed pass.
 */
static npy_ulong*
radixsort0_ulong(npy_ulong *arr, npy_ulong *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulong)][1 << 8] = { { 0 } };
    const npy_ulong first_key = KEY_OF(arr[0]);
    npy_ulong *src = arr;
    npy_ulong *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulong key = KEY_OF(src[n]);

        for (digit = 0; digit < sizeof(npy_ulong); digit++) {
            hist[digit][nth_byte_ulong(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_ulong *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_ulong(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_ulong key = KEY_OF(src[n]);
            const npy_intp slot = bucket[nth_byte_ulong(key, digit)]++;

            dst[slot] = src[n];
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Sort the `num` elements at `start` in place, in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulong.  Inputs with fewer than two
 * elements, or whose keys are already non-decreasing, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` elements is
 * allocated for radixsort0_ulong, and the result is copied back into `start`
 * when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
radixsort_ulong(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulong *values = start;
    npy_ulong prev_key;
    npy_intp pos;
    npy_ulong *scratch;
    void *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[0]);
    for (pos = 1; pos < num; pos++) {
        npy_ulong cur_key = KEY_OF(values[pos]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ulong));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted data may end up in either buffer. */
    result = radixsort0_ulong(start, scratch, num);
    if (result != start) {
        memcpy(start, result, num * sizeof(npy_ulong));
    }

    free(scratch);
    return 0;
}

/*
 * Byte-wise least-significant-digit radix sort of the index array `tosort`
 * by the keys of the elements it refers to, i.e. by KEY_OF(arr[tosort[i]]).
 * `arr` itself is never written.
 *
 * The per-byte histograms are built by scanning arr[0..num-1] directly, not
 * through `tosort`.  That matches the keys seen by the scatter passes only
 * when `tosort` holds each index 0..num-1 exactly once -- presumably
 * guaranteed by the callers; confirm before using this with any other index
 * set.
 *
 * Each pass scatters indices from the current source buffer into the other
 * one and the two buffers then trade roles, so both `tosort` and `aux` must
 * have room for `num` indices.  Byte positions at which every key carries
 * the same value are skipped.  arr[0] is read unconditionally, so `num` must
 * be at least 1.
 *
 * Returns the buffer (either `tosort` or `aux`) that holds the sorted
 * indices after the last executed pass.
 */
static npy_intp*
aradixsort0_ulong(npy_ulong *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulong)][1 << 8] = { { 0 } };
    const npy_ulong first_key = KEY_OF(arr[0]);
    npy_intp *src = tosort;
    npy_intp *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single sequential scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulong key = KEY_OF(arr[n]);

        for (digit = 0; digit < sizeof(npy_ulong); digit++) {
            hist[digit][nth_byte_ulong(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_intp *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_ulong(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter the indices in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_intp idx = src[n];
            const npy_ulong key = KEY_OF(arr[idx]);
            const npy_intp slot = bucket[nth_byte_ulong(key, digit)]++;

            dst[slot] = idx;
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Indirect (argsort) radix sort: reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulong and is not modified by this
 * function.  Inputs with fewer than two indices, or whose keys already form
 * a non-decreasing sequence when read through `tosort`, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` indices is
 * allocated for aradixsort0_ulong, and the result is copied back into
 * `tosort` when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
aradixsort_ulong(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulong *values = start;
    npy_ulong prev_key;
    npy_intp pos;
    npy_intp *scratch;
    npy_intp *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        npy_ulong cur_key = KEY_OF(values[tosort[pos]]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted indices may end up in either buffer. */
    result = aradixsort0_ulong(start, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

#undef KEY_OF


#line 25

// KEY_OF(x) maps a stored element to the unsigned key whose bytes the radix
// passes below bucket on.
// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// In this instantiation (suffix `longlong`, elements handled as
// npy_ulonglong) the outer switch is 1 and the inner switch is 0, so the
// sign-bit flip in the inner #else branch is the definition in effect.
#if 1
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ulonglong) * 8 - 1)) | ((npy_ulonglong)1 << (sizeof(npy_ulonglong) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ulonglong)1 << (sizeof(npy_ulonglong) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 is the least-significant
 * byte: the key is shifted right by 8 * l bits and the low 8 bits are kept.
 */
static inline npy_ubyte
nth_byte_longlong(npy_ulonglong key, npy_intp l) {
    const npy_intp shift = l * 8;

    return (npy_ubyte)((key >> shift) & 0xFF);
}

/*
 * Byte-wise least-significant-digit radix sort of `num` elements handled as
 * npy_ulonglong bit patterns.
 *
 * Elements are ordered by KEY_OF(element), one key byte per pass, starting
 * with the lowest byte; the elements themselves are moved unchanged.  Each
 * pass scatters from the current source buffer into the other one and the
 * two buffers then trade roles, so both `arr` and `aux` must have room for
 * `num` elements.  Byte positions at which every key carries the same value
 * are skipped, because such a pass could not change the order.
 *
 * arr[0] is read unconditionally, so `num` must be at least 1.
 *
 * Returns the buffer (either `arr` or `aux`) that holds the sorted elements
 * after the last executed pass.
 */
static npy_ulonglong*
radixsort0_longlong(npy_ulonglong *arr, npy_ulonglong *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulonglong)][1 << 8] = { { 0 } };
    const npy_ulonglong first_key = KEY_OF(arr[0]);
    npy_ulonglong *src = arr;
    npy_ulonglong *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulonglong key = KEY_OF(src[n]);

        for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
            hist[digit][nth_byte_longlong(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_ulonglong *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_longlong(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_ulonglong key = KEY_OF(src[n]);
            const npy_intp slot = bucket[nth_byte_longlong(key, digit)]++;

            dst[slot] = src[n];
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Sort the `num` elements at `start` in place, in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulonglong.  Inputs with fewer than
 * two elements, or whose keys are already non-decreasing, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` elements is
 * allocated for radixsort0_longlong, and the result is copied back into
 * `start` when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
radixsort_longlong(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulonglong *values = start;
    npy_ulonglong prev_key;
    npy_intp pos;
    npy_ulonglong *scratch;
    void *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[0]);
    for (pos = 1; pos < num; pos++) {
        npy_ulonglong cur_key = KEY_OF(values[pos]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ulonglong));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted data may end up in either buffer. */
    result = radixsort0_longlong(start, scratch, num);
    if (result != start) {
        memcpy(start, result, num * sizeof(npy_ulonglong));
    }

    free(scratch);
    return 0;
}

/*
 * Byte-wise least-significant-digit radix sort of the index array `tosort`
 * by the keys of the elements it refers to, i.e. by KEY_OF(arr[tosort[i]]).
 * `arr` itself is never written.
 *
 * The per-byte histograms are built by scanning arr[0..num-1] directly, not
 * through `tosort`.  That matches the keys seen by the scatter passes only
 * when `tosort` holds each index 0..num-1 exactly once -- presumably
 * guaranteed by the callers; confirm before using this with any other index
 * set.
 *
 * Each pass scatters indices from the current source buffer into the other
 * one and the two buffers then trade roles, so both `tosort` and `aux` must
 * have room for `num` indices.  Byte positions at which every key carries
 * the same value are skipped.  arr[0] is read unconditionally, so `num` must
 * be at least 1.
 *
 * Returns the buffer (either `tosort` or `aux`) that holds the sorted
 * indices after the last executed pass.
 */
static npy_intp*
aradixsort0_longlong(npy_ulonglong *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulonglong)][1 << 8] = { { 0 } };
    const npy_ulonglong first_key = KEY_OF(arr[0]);
    npy_intp *src = tosort;
    npy_intp *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single sequential scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulonglong key = KEY_OF(arr[n]);

        for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
            hist[digit][nth_byte_longlong(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_intp *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_longlong(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter the indices in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_intp idx = src[n];
            const npy_ulonglong key = KEY_OF(arr[idx]);
            const npy_intp slot = bucket[nth_byte_longlong(key, digit)]++;

            dst[slot] = idx;
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Indirect (argsort) radix sort: reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulonglong and is not modified by
 * this function.  Inputs with fewer than two indices, or whose keys already
 * form a non-decreasing sequence when read through `tosort`, return
 * immediately without allocating.  Otherwise a scratch buffer of `num`
 * indices is allocated for aradixsort0_longlong, and the result is copied
 * back into `tosort` when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
aradixsort_longlong(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulonglong *values = start;
    npy_ulonglong prev_key;
    npy_intp pos;
    npy_intp *scratch;
    npy_intp *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        npy_ulonglong cur_key = KEY_OF(values[tosort[pos]]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted indices may end up in either buffer. */
    result = aradixsort0_longlong(start, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

#undef KEY_OF


#line 25

// KEY_OF(x) maps a stored element to the unsigned key whose bytes the radix
// passes below bucket on.
// Reference: https://github.com/eloj/radix-sorting#-key-derivation
//
// In this instantiation (npy_ulonglong elements) the outer switch is 0, so
// the identity mapping in the final #else branch is the definition in effect
// and both definitions inside the outer #if are compiled out.
#if 0
    // Floating-point is currently disabled.
    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
    // Basic sorting tests succeed but others relying on sort fail.
    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
    #if 0
        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(npy_ulonglong) * 8 - 1)) | ((npy_ulonglong)1 << (sizeof(npy_ulonglong) * 8 - 1))))
    #else
        // For signed ints, we flip the sign bit so the negatives are below the positives.
        #define KEY_OF(x) ((x) ^ ((npy_ulonglong)1 << (sizeof(npy_ulonglong) * 8 - 1)))
    #endif
#else
    // For unsigned ints, the key is as-is
    #define KEY_OF(x) (x)
#endif

/*
 * Return byte number `l` of `key`, where l == 0 is the least-significant
 * byte: the key is shifted right by 8 * l bits and the low 8 bits are kept.
 */
static inline npy_ubyte
nth_byte_ulonglong(npy_ulonglong key, npy_intp l) {
    const npy_intp shift = l * 8;

    return (npy_ubyte)((key >> shift) & 0xFF);
}

/*
 * Byte-wise least-significant-digit radix sort of `num` npy_ulonglong
 * elements.
 *
 * Elements are ordered by KEY_OF(element), one key byte per pass, starting
 * with the lowest byte.  Each pass scatters from the current source buffer
 * into the other one and the two buffers then trade roles, so both `arr`
 * and `aux` must have room for `num` elements.  Byte positions at which
 * every key carries the same value are skipped, because such a pass could
 * not change the order.
 *
 * arr[0] is read unconditionally, so `num` must be at least 1.
 *
 * Returns the buffer (either `arr` or `aux`) that holds the sorted elements
 * after the last executed pass.
 */
static npy_ulonglong*
radixsort0_ulonglong(npy_ulonglong *arr, npy_ulonglong *aux, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulonglong)][1 << 8] = { { 0 } };
    const npy_ulonglong first_key = KEY_OF(arr[0]);
    npy_ulonglong *src = arr;
    npy_ulonglong *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulonglong key = KEY_OF(src[n]);

        for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
            hist[digit][nth_byte_ulonglong(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_ulonglong *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_ulonglong(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_ulonglong key = KEY_OF(src[n]);
            const npy_intp slot = bucket[nth_byte_ulonglong(key, digit)]++;

            dst[slot] = src[n];
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Sort the `num` elements at `start` in place, in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulonglong.  Inputs with fewer than
 * two elements, or whose keys are already non-decreasing, return immediately
 * without allocating.  Otherwise a scratch buffer of `num` elements is
 * allocated for radixsort0_ulonglong, and the result is copied back into
 * `start` when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
radixsort_ulonglong(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulonglong *values = start;
    npy_ulonglong prev_key;
    npy_intp pos;
    npy_ulonglong *scratch;
    void *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[0]);
    for (pos = 1; pos < num; pos++) {
        npy_ulonglong cur_key = KEY_OF(values[pos]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_ulonglong));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted data may end up in either buffer. */
    result = radixsort0_ulonglong(start, scratch, num);
    if (result != start) {
        memcpy(start, result, num * sizeof(npy_ulonglong));
    }

    free(scratch);
    return 0;
}

/*
 * Byte-wise least-significant-digit radix sort of the index array `tosort`
 * by the keys of the elements it refers to, i.e. by KEY_OF(arr[tosort[i]]).
 * `arr` itself is never written.
 *
 * The per-byte histograms are built by scanning arr[0..num-1] directly, not
 * through `tosort`.  That matches the keys seen by the scatter passes only
 * when `tosort` holds each index 0..num-1 exactly once -- presumably
 * guaranteed by the callers; confirm before using this with any other index
 * set.
 *
 * Each pass scatters indices from the current source buffer into the other
 * one and the two buffers then trade roles, so both `tosort` and `aux` must
 * have room for `num` indices.  Byte positions at which every key carries
 * the same value are skipped.  arr[0] is read unconditionally, so `num` must
 * be at least 1.
 *
 * Returns the buffer (either `tosort` or `aux`) that holds the sorted
 * indices after the last executed pass.
 */
static npy_intp*
aradixsort0_ulonglong(npy_ulonglong *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
    npy_intp hist[sizeof(npy_ulonglong)][1 << 8] = { { 0 } };
    const npy_ulonglong first_key = KEY_OF(arr[0]);
    npy_intp *src = tosort;
    npy_intp *dst = aux;
    size_t digit;
    npy_intp n;

    /* A single sequential scan fills the histograms of all byte positions. */
    for (n = 0; n < num; n++) {
        const npy_ulonglong key = KEY_OF(arr[n]);

        for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
            hist[digit][nth_byte_ulonglong(key, digit)] += 1;
        }
    }

    for (digit = 0; digit < sizeof(npy_ulonglong); digit++) {
        npy_intp *bucket = hist[digit];
        npy_intp offset = 0;
        npy_intp *swap;
        int value;

        /* All keys share this byte value: nothing to reorder here. */
        if (bucket[nth_byte_ulonglong(first_key, digit)] == num) {
            continue;
        }

        /* Turn the counts into the first output slot of each byte value. */
        for (value = 0; value < 256; value++) {
            const npy_intp count = bucket[value];

            bucket[value] = offset;
            offset += count;
        }

        /* Scatter the indices in input order into the other buffer. */
        for (n = 0; n < num; n++) {
            const npy_intp idx = src[n];
            const npy_ulonglong key = KEY_OF(arr[idx]);
            const npy_intp slot = bucket[nth_byte_ulonglong(key, digit)]++;

            dst[slot] = idx;
        }

        /* The buffer just written becomes the input of the next pass. */
        swap = dst;
        dst = src;
        src = swap;
    }

    return src;
}

/*
 * Indirect (argsort) radix sort: reorder the `num` indices in `tosort` so
 * that the elements of `start` they refer to are in ascending KEY_OF order.
 *
 * `start` is accessed as an array of npy_ulonglong and is not modified by
 * this function.  Inputs with fewer than two indices, or whose keys already
 * form a non-decreasing sequence when read through `tosort`, return
 * immediately without allocating.  Otherwise a scratch buffer of `num`
 * indices is allocated for aradixsort0_ulonglong, and the result is copied
 * back into `tosort` when the final pass left it in the scratch buffer.
 *
 * The last argument is unused.
 *
 * Returns 0 on success, -NPY_ENOMEM if the scratch buffer cannot be
 * allocated.
 */
NPY_NO_EXPORT int
aradixsort_ulonglong(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
    npy_ulonglong *values = start;
    npy_ulonglong prev_key;
    npy_intp pos;
    npy_intp *scratch;
    npy_intp *result;

    if (num < 2) {
        return 0;
    }

    /* Look for the first descent; running off the end means already sorted. */
    prev_key = KEY_OF(values[tosort[0]]);
    for (pos = 1; pos < num; pos++) {
        npy_ulonglong cur_key = KEY_OF(values[tosort[pos]]);

        if (cur_key < prev_key) {
            break;
        }
        prev_key = cur_key;
    }
    if (pos == num) {
        return 0;
    }

    scratch = malloc(num * sizeof(npy_intp));
    if (scratch == NULL) {
        return -NPY_ENOMEM;
    }

    /* The sorted indices may end up in either buffer. */
    result = aradixsort0_ulonglong(start, scratch, tosort, num);
    if (result != tosort) {
        memcpy(tosort, result, num * sizeof(npy_intp));
    }

    free(scratch);
    return 0;
}

#undef KEY_OF



