Sorting small blocks (<5 elements) with a loopless algorithm may improve performance. I found only one example how to write a loopless sorting algorithm for 5 elements: http://wiki.tcl.tk/4118
The idea can be used to generate loopless sorting algorithms for 2,3,4,5 elements in C.
EDIT: i tried it on one set of data, and i measured 87% running time compared to the code in the question. Using insertion sort on <20 blocks resulted 92% on the same data set. This measurement is not representative but may show that this is a way how can You improve Your quicksort code.
EDIT: this example code write out loopless sorting functions for 2-6 elements:
#include <stdio.h>
#define OUT(i,fmt,...) printf("%*.s"fmt"\n",i,"",##__VA_ARGS__);
#define IF( a,b, i, block1, block2 ) \
OUT(i,"if(t[%i]>t[%i]){",a,b) block1 OUT(i,"}else{") block2 OUT(i,"}")
#define AB2(i,a,b) IF(a,b,i,P2(i+2,b,a),P2(i+2,a,b))
#define P2(i,a,b) print_perm(i,2,(int[2]){a,b});
#define AB3(i,a,b,c) IF(a,b,i,BC3(i+2,b,a,c),BC3(i+2,a,b,c))
#define AC3(i,a,b,c) IF(a,c,i, P3(i+2,c,a,b), P3(i+2,a,c,b))
#define BC3(i,a,b,c) IF(b,c,i,AC3(i+2,a,b,c), P3(i+2,a,b,c))
#define P3(i,a,b,c) print_perm(i,3,(int[3]){a,b,c});
#define AB4(i,a,b,c,d) IF(a,b,i,CD4(i+2,b,a,c,d),CD4(i+2,a,b,c,d))
#define AC4(i,a,b,c,d) IF(a,c,i, P4(i+2,c,a,b,d), P4(i+2,a,c,b,d))
#define BC4(i,a,b,c,d) IF(b,c,i,AC4(i+2,a,b,c,d), P4(i+2,a,b,c,d))
#define BD4(i,a,b,c,d) IF(b,d,i,BC4(i+2,c,d,a,b),BC4(i+2,a,b,c,d))
#define CD4(i,a,b,c,d) IF(c,d,i,BD4(i+2,a,b,d,c),BD4(i+2,a,b,c,d))
#define P4(i,a,b,c,d) print_perm(i,4,(int[4]){a,b,c,d});
#define AB5(i,a,b,c,d,e) IF(a,b,i,CD5(i+2,b,a,c,d,e),CD5(i+2,a,b,c,d,e))
#define AC5(i,a,b,c,d,e) IF(a,c,i, P5(i+2,c,a,b,d,e), P5(i+2,a,c,b,d,e))
#define AE5(i,a,b,c,d,e) IF(a,e,i,CB5(i+2,e,a,c,b,d),CB5(i+2,a,e,c,b,d))
#define BE5(i,a,b,c,d,e) IF(b,e,i,AE5(i+2,a,b,c,d,e),DE5(i+2,a,b,c,d,e))
#define BD5(i,a,b,c,d,e) IF(b,d,i,BE5(i+2,c,d,a,b,e),BE5(i+2,a,b,c,d,e))
#define CB5(i,a,b,c,d,e) IF(c,b,i,DC5(i+2,a,b,c,d,e),AC5(i+2,a,b,c,d,e))
#define CD5(i,a,b,c,d,e) IF(c,d,i,BD5(i+2,a,b,d,c,e),BD5(i+2,a,b,c,d,e))
#define DC5(i,a,b,c,d,e) IF(d,c,i, P5(i+2,a,b,c,d,e), P5(i+2,a,b,d,c,e))
#define DE5(i,a,b,c,d,e) IF(d,e,i,CB5(i+2,a,b,c,e,d),CB5(i+2,a,b,c,d,e))
#define P5(i,a,b,c,d,e) print_perm(i,5,(int[5]){a,b,c,d,e});
#define AB6(i,a,b,c,d,e,f) IF(a,b,i,CD6(i+2,b,a,c,d,e,f),CD6(i+2,a,b,c,d,e,f))
#define AC6(i,a,b,c,d,e,f) IF(a,c,i, P6(i+2,c,a,b,d,e,f), P6(i+2,a,c,b,d,e,f))
#define AE6(i,a,b,c,d,e,f) IF(a,e,i,CB6(i+2,e,a,c,b,d,f),CB6(i+2,a,e,c,b,d,f))
#define BD6(i,a,b,c,d,e,f) IF(b,d,i,DF6(i+2,c,d,a,b,e,f),DF6(i+2,a,b,c,d,e,f))
#define BE6(i,a,b,c,d,e,f) IF(b,e,i,AE6(i+2,a,b,c,d,e,f),DE6(i+2,a,b,c,d,e,f))
#define CB6(i,a,b,c,d,e,f) IF(c,b,i,DC6(i+2,a,b,c,d,e,f),AC6(i+2,a,b,c,d,e,f))
#define CD6(i,a,b,c,d,e,f) IF(c,d,i,EF6(i+2,a,b,d,c,e,f),EF6(i+2,a,b,c,d,e,f))
#define DB6(i,a,b,c,d,e,f) IF(d,b,i,BE6(i+2,a,b,c,d,e,f),BE6(i+2,c,d,a,b,e,f))
#define DC6(i,a,b,c,d,e,f) IF(d,c,i, P6(i+2,a,b,c,d,e,f), P6(i+2,a,b,d,c,e,f))
#define DE6(i,a,b,c,d,e,f) IF(d,e,i,CB6(i+2,a,b,c,e,d,f),CB6(i+2,a,b,c,d,e,f))
#define DF6(i,a,b,c,d,e,f) IF(d,f,i,DB6(i+2,a,b,e,f,c,d),BE6(i+2,a,b,c,d,e,f))
#define EF6(i,a,b,c,d,e,f) IF(e,f,i,BD6(i+2,a,b,c,d,f,e),BD6(i+2,a,b,c,d,e,f))
#define P6(i,a,b,c,d,e,f) print_perm(i,6,(int[6]){a,b,c,d,e,f});
#define SORT(ABn,n,...) \
OUT( 0, ""); \
OUT( 0, "inline void sort" #n "( int t[" #n "] ){" ) \
OUT( 2, "int tmp;" ) \
ABn( 2, __VA_ARGS__ ) \
OUT( 0, "}" )
void print_perm( int ind, int n, int t[n] ){
printf("%*.s", ind-1, "");
for( int i=0; i<n; i++ )
if( i != t[i] ){
printf(" tmp=t[%i]; t[%i]=",i,i);
for( int j=t[i],k; j!=i; k=j,j=t[j],t[k]=k)
printf("t[%i]; t[%i]=",j,j);
printf("tmp;");
}
printf("\n");
}
int main( void ){
SORT( AB2, 2, 0,1 )
SORT( AB3, 3, 0,1,2 )
SORT( AB4, 4, 0,1,2,3 )
SORT( AB5, 5, 0,1,2,3,4 )
SORT( AB6, 6, 0,1,2,3,4,5 )
}
The generated code 3718 lines long:
sort2(): 8
sort3(): 24
sort4(): 96
sort5(): 512
sort6(): 3072