From 22263709eda9f7d692a0f484fd759f757418dbd7 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Wed, 27 Apr 2011 13:27:04 -0400 Subject: [PATCH] replace heap sort with smoothsort implementation by Valentin Ochs Smoothsort is an adaptive variant of heapsort. This version was written by Valentin Ochs (apo) specifically for inclusion in musl. I worked with him to get it working in O(1) memory usage even with giant array element widths, and to optimize it heavily for size and speed. It's still roughly 4 times as large as the old heap sort implementation, but roughly 20 times faster given an almost-sorted array of 1M elements (20 being the base-2 log of 1M), i.e. it really does reduce O(n log n) to O(n) in the mostly-sorted case. It's still somewhat slower than glibc's Introsort for random input, but now considerably faster than glibc when the input is already sorted, or mostly sorted. --- COPYRIGHT | 4 + src/stdlib/qsort.c | 225 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 197 insertions(+), 32 deletions(-) diff --git a/COPYRIGHT b/COPYRIGHT index 92d8992e..b4b60d10 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -19,6 +19,10 @@ The implementation of DES for crypt (src/misc/crypt.c) is Copyright © 1994 David Burren. It is licensed under a BSD license compatible with the GNU LGPL. +The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 +Valentin Ochs and is licensed under an MIT-style license compatible +with the GNU LGPL. + The x86_64 port was written by Nicholas J. Kain. See individual files for their copyright status. 
diff --git a/src/stdlib/qsort.c b/src/stdlib/qsort.c
index 91a3361d..866af0ec 100644
--- a/src/stdlib/qsort.c
+++ b/src/stdlib/qsort.c
@@ -1,50 +1,211 @@
+/* Copyright (C) 2011 by Valentin Ochs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* Minor changes by Rich Felker for integration in musl, 2011-04-27. */
+
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
-/* A simple heap sort implementation.. only in-place O(nlogn) sort I know. */
+#include "atomic.h"
+#define ntz(x) a_ctz_l((x)) /* bit scan used by pntz(); presumably a trailing-zero count supplied by atomic.h - confirm */
 
-#define MIN(a, b) ((a)<(b) ? (a) : (b))
+typedef int (*cmpfun)(const void *, const void *); /* element comparison callback handed to qsort() */
+
+static inline int pntz(size_t p[2]) { /* bit scan of the two-word mask p (p[0] low word, p[1] high word) after subtracting 1 from p[0] */
+	int r = ntz(p[0] - 1);
+	if(r != 0 || (r = 8*sizeof(size_t) + ntz(p[1])) != 8*sizeof(size_t)) { /* r == 0: the low word gave no hit (leans on what ntz(0) returns - confirm), so try the high word */
+		return r;
+	}
+	return 0; /* reached only when both ntz() calls returned 0 */
+}
 
-static void swap(char *a, char *b, size_t len)
+static void cycle(size_t width, unsigned char* ar[], int n) /* rotate width-byte elements: ar[i] <- ar[i+1] for i < n-1, ar[n-1] <- old ar[0] */
 {
-	char tmp[256];
+	unsigned char tmp[256]; /* bounce buffer; wider elements are rotated in chunks of this size */
 	size_t l;
-	while (len) {
-		l = MIN(sizeof tmp, len);
-		memcpy(tmp, a, l);
-		memcpy(a, b, l);
-		memcpy(b, tmp, l);
-		a += l;
-		b += l;
-		len -= l;
+	int i;
+
+	if(n < 2) { /* fewer than two elements: nothing to rotate */
+		return;
+	}
+
+	ar[n] = tmp; /* the caller must provide the spare slot ar[n]; tmp closes the cycle */
+	while(width) {
+		l = sizeof(tmp) < width ? sizeof(tmp) : width;
+		memcpy(ar[n], ar[0], l); /* save the current chunk of ar[0] */
+		for(i = 0; i < n; i++) {
+			memcpy(ar[i], ar[i + 1], l);
+			ar[i] += l; /* step to the next chunk; this modifies the caller's pointer array */
+		}
+		width -= l;
+	}
+}
+
+/* shl() and shr() shift the two-word value p (p[0] low word, p[1] high word) by n bits; they need n > 0 */
+static inline void shl(size_t p[2], int n)
+{
+	if(n >= 8 * sizeof(size_t)) { /* move whole words first */
+		n -= 8 * sizeof(size_t);
+		p[1] = p[0];
+		p[0] = 0;
+	}
+	p[1] <<= n;
+	p[1] |= p[0] >> (sizeof(size_t) * 8 - n); /* NOTE(review): n is 0 here when the request was exactly one word, making this a full-width shift */
+	p[0] <<= n;
+}
+
+static inline void shr(size_t p[2], int n) /* right-shift counterpart of shl() */
+{
+	if(n >= 8 * sizeof(size_t)) { /* move whole words first */
+		n -= 8 * sizeof(size_t);
+		p[0] = p[1];
+		p[1] = 0;
 	}
+	p[0] >>= n;
+	p[0] |= p[1] << (sizeof(size_t) * 8 - n); /* NOTE(review): same full-width shift as in shl() when n was exactly one word */
+	p[1] >>= n;
 }
 
-static void sift(char *base, size_t root, size_t nel, size_t width, int (*cmp)(const void *, const void *))
+static void sift(unsigned char *head, size_t width, cmpfun cmp, int pshift, size_t lp[]) /* move the element at head down through rt/lf until it is >= both, then rotate the path */
 {
-	size_t max;
+	unsigned char *rt, *lf; /* the two candidates below head: rt directly before it, lf a further lp[pshift - 2] bytes back */
+	unsigned char *ar[14 * sizeof(size_t) + 1]; /* path of visited elements handed to cycle(); bound presumably covers the deepest path - confirm */
+	int i = 1;
+
+	ar[0] = head;
+	while(pshift > 1) {
+		rt = head - width;
+		lf = head - width - lp[pshift - 2];
 
-	while (2*root <= nel) {
-		max = 2*root;
-		if (max < nel && cmp(base+max*width, base+(max+1)*width) < 0)
-			max++;
-		if (max && cmp(base+root*width, base+max*width) < 0) {
-			swap(base+root*width, base+max*width, width);
-			root = max;
-		} else break;
+		if((*cmp)(ar[0], lf) >= 0 && (*cmp)(ar[0], rt) >= 0) { /* the original head element is already >= both candidates */
+			break;
+		}
+		if((*cmp)(lf, rt) >= 0) { /* descend into the larger candidate */
+			ar[i++] = lf;
+			head = lf;
+			pshift -= 1;
+		} else {
+			ar[i++] = rt;
+			head = rt;
+			pshift -= 2;
+		}
 	}
+	cycle(width, ar, i); /* one rotation places every element on the recorded path */
 }
 
-void qsort(void *_base, size_t nel, size_t width, int (*cmp)(const void *, const void *))
+static void trinkle(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2], int pshift, int trusty, size_t lp[]) /* walk head back across stepsons while they compare greater than the element being placed, then sift */
 {
-	char *base = _base;
-	size_t i;
-
-	if (!nel) return;
-	for (i=(nel+1)/2; i; i--)
-		sift(base, i-1, nel-1, width, cmp);
-	for (i=nel-1; i; i--) {
-		swap(base, base+i*width, width);
-		sift(base, 0, i-1, width, cmp);
+	unsigned char *stepson,
+	              *rt, *lf;
+	size_t p[2]; /* private copy of pp so the caller's mask is not modified */
+	unsigned char *ar[14 * sizeof(size_t) + 1]; /* path of visited elements handed to cycle() */
+	int i = 1;
+	int trail;
+
+	p[0] = pp[0];
+	p[1] = pp[1];
+
+	ar[0] = head;
+	while(p[0] != 1 || p[1] != 0) {
+		stepson = head - lp[pshift]; /* candidate lp[pshift] bytes before head */
+		if((*cmp)(stepson, ar[0]) <= 0) { /* stepson <= element being placed: stop */
+			break;
+		}
+		if(!trusty && pshift > 1) { /* untrusted: also stop once rt or lf is >= stepson */
+			rt = head - width;
+			lf = head - width - lp[pshift - 2];
+			if((*cmp)(rt, stepson) >= 0 || (*cmp)(lf, stepson) >= 0) {
+				break;
+			}
+		}
+
+		ar[i++] = stepson;
+		head = stepson;
+		trail = pntz(p);
+		shr(p, trail);
+		pshift += trail;
+		trusty = 0; /* after a move the sift below is always needed */
+	}
+	if(!trusty) { /* skipped only when called with trusty set and no move happened */
+		cycle(width, ar, i);
+		sift(head, width, cmp, pshift, lp);
+	}
+}
+
+void qsort(void *base, size_t nel, size_t width, cmpfun cmp) /* sort the nel elements of width bytes starting at base, ordered by cmp */
+{
+	size_t lp[12*sizeof(size_t)]; /* Leonardo-number table in bytes, filled below */
+	size_t i, size = width * nel;
+	unsigned char *head = base,
+	              *high = size ? head + size - width : head; /* last element; never step below base when there is nothing to sort */
+	size_t p[2] = {1, 0}; /* two-word mask updated together with pshift as head moves */
+	int pshift = 1;
+	int trail;
+
+	/* Precompute Leonardo numbers, scaled by element width */
+	for(lp[0]=lp[1]=width, i=2; (lp[i]=lp[i-2]+lp[i-1]+width) < size; i++);
+
+	while(head < high) { /* forward pass: advance head one element at a time up to the last element */
+		if((p[0] & 3) == 3) {
+			sift(head, width, cmp, pshift, lp);
+			shr(p, 2);
+			pshift += 2;
+		} else {
+			if(lp[pshift - 1] >= high - head) {
+				trinkle(head, width, cmp, p, pshift, 0, lp);
+			} else {
+				sift(head, width, cmp, pshift, lp);
+			}
+
+			if(pshift == 1) {
+				shl(p, 1);
+				pshift = 0;
+			} else {
+				shl(p, pshift - 1);
+				pshift = 1;
+			}
+		}
+
+		p[0] |= 1;
+		head += width;
+	}
+
+	trinkle(head, width, cmp, p, pshift, 0, lp); /* called once more after the forward pass has stopped */
+
+	while(pshift != 1 || p[0] != 1 || p[1] != 0) { /* backward pass: step head down until pshift and p are back at their initial values */
+		if(pshift <= 1) {
+			trail = pntz(p);
+			shr(p, trail);
+			pshift += trail;
+		} else {
+			shl(p, 2);
+			pshift -= 2;
+			p[0] ^= 7;
+			shr(p, 1);
+			trinkle(head - lp[pshift] - width, width, cmp, p, pshift + 1, 1, lp);
+			shl(p, 1);
+			p[0] |= 1;
+			trinkle(head - width, width, cmp, p, pshift, 1, lp);
+		}
+		head -= width;
 	}
 }
-- 
2.25.1