The key rule is "fork only when the forking overhead is much smaller than the amount of work the fork will do". Since forking overhead is a property of the specific technology you use, and so is the effort to do the work, you in some sense have to determine this empirically. You'll likely end up with some threshold tuning constant in your code to represent this tradeoff.
What you will discover in practice is that finding seperable chunks of work is actually hard. If you make the work chunk small, it hasn't got a lot of dependencies and you can schedule it once all its input dataflows are ready. But small chunks usually mean small work, and the forking overhead usually negates the gain. If you try to make the chunks big, they have so many dependences that you can't break them out to schedule them.
Some people are lucky and can find such big chunks; we call most of those people physicists and/or Fortran programmers and they are taking advantage of data parallelism induced by dividing the world into as many tiny pieces as they can.
The only decent cure I know of is to use a spectacularly fast forking mechanism, so that you can find the smallest practical chunks. Unfortunately, the parallelism libraries offered to do this are ... libraries, invoked dynamically, with corresponding dynamic invocation overhead. Typical libraries containing parallelism primitives takes 100s to thousands of cycles to implement a "fork"; this is bad news if your chunk of work is 100 machine instructions.
I believe strongly that to get such fast forking mechanisms, the language compiler has to know that you are doing the fork, e.g., "fork" (however spelled :-) has be a keyword in the language. Then the compiler can see the forks, and preallocate everything needed to minimize the time to accomplish this, and generate special code to manage the forking (and joining) steps.
The PARLANSE language that I designed, and that we use at Semantic Designs is one such language.
It is a Lisp-like language in syntax (but not in semantics). Its parallelism operator is spelled "(|| ... )". You can see it below in the Quicksort module we use daily, below.
You can also see the explicit QuickSortParallelThreshold value, determined empirically.
This Quicksort scales linearly to 8 cores on an Intel x86 system.
(define QuickSort
(module
(;; (define Value nu)
(compileifthen (~ (defined QuickSortWithParlanseBuiltInOrderingOfNu))
(define QuickSortWithParlanseBuiltInOrderingOfNu ~f) ; use PARLANSE comparison operators
)compileifthen
(compileifthen (~ (defined QuickSortParallelThreshold))
(define QuickSortParallelThreshold 100)
)compileifthen
(compileifthen (~ (defined QuickSortThreshold))
(compileifthenelse QuickSortWithParlanseBuiltInOrderingOfNu
(define QuickSortThreshold 16)
(define QuickSortThreshold 8)
)compileifthenelse
)compileifthen
(compileifthenelse (~ (defined QuickSortWithCompareByReference))
(define QuickSortWithCompareByReference ~f)
(compileifthen QuickSortWithParlanseBuiltInOrderingOfNu
(define QuickSortWithCompareByReference ~f)
)compileifthen
)compileifthenelse
(define SortRange
(action (procedure (structure (compileifthen (~ QuickSortWithParlanseBuiltInOrderingOfNu)
(compileifthenelse (~ QuickSortWithCompareByReference)
[compare (function (sort integer (range -1 +1)) (structure [value1 Value] [value2 Value]))]
[compare (function (sort integer (range -1 +1)) (structure [value1 (reference Value)] [value2 (reference Value)]))]
)compileifthenelse
)compileifthen
[a (reference (array Value 1 dynamic))]
[from natural]
[to natural]
)structure
)procedure
(local (;; (define quicksort
(action (procedure (structure [l integer] [r integer])))
)define
(define quicksort
(action (procedure (structure [l integer] [r integer]))
(ifthenelse (<= (- r l) (coerce integer QuickSortThreshold))
(do [i integer] (++ l) r +1
(local (= [exch Value] a:i)
(block exit_if_inserted
(;; (do [j integer] (-- i) l -1
(ifthenelse (compileifthenelse QuickSortWithParlanseBuiltInOrderingOfNu
(> a:j exch)
(compileifthenelse (~ QuickSortWithCompareByReference)
(== (compare a:j exch) +1)
(== (compare (. a:j) (. exch)) +1)
)compileifthenelse
)compileifthenelse
(= a:(++ j) a:j)
(;; (= a:(++ j) exch)
(exitblock exit_if_inserted)
);;
)ifthenelse
)do
(= a:l exch)
);;
)block
)local
)do
(local (;; (= [i integer] l)
(= [j integer] r)
(= [p integer] l)
(= [q integer] r)
[exch Value]
);;
(;;
`use middle element as pivot':
(local (= [m integer] (// (+ l r) +2))
(;; (= exch a:m)
(= a:m a:r)
(= a:r exch)
);;
)local
`4-way partitioning = < > =':
(loop exit_if_partitioned
(;;
`find element greater than pivot':
(loop exit_if_greater_than_found
(;; (compileifthenelse QuickSortWithParlanseBuiltInOrderingOfNu
(ifthenelse (< a:i a:r)
(consume ~t)
(ifthenelse (> a:i a:r)
(exitblock exit_if_greater_than_found)
(;; (ifthen (>= i j)
(exitblock exit_if_partitioned)
)ifthen
(= exch a:p)
(= a:p a:i)
(= a:i exch)
(+= p 1)
);;
)ifthenelse
)ifthenelse
(case (compileifthenelse (~ QuickSortWithCompareByReference)
(compare a:i a:r)
(compare (. a:i) (. a:r))
)compileifthenelse
-1
(consume ~t)
+1
(exitblock exit_if_greater_than_found)
else (;; (ifthen (>= i j)
(exitblock exit_if_partitioned)
)ifthen
(= exch a:p)
(= a:p a:i)
(= a:i exch)
(+= p 1)
);;
)case
)compileifthenelse
(+= i 1)
);;
)loop
`find element less than to pivot':
(loop exit_if_less_than_found
(;; (-= j 1)
(ifthen (>= i j)
(exitblock exit_if_partitioned)
)ifthen
(compileifthenelse QuickSortWithParlanseBuiltInOrderingOfNu
(ifthenelse (< a:j a:r)
(exitblock exit_if_less_than_found)
(ifthenelse (> a:j a:r)
(consume ~t)
(;; (-= q 1)
(= exch a:j)
(= a:j a:q)
(= a:q exch)
);;
)ifthenelse
)ifthenelse
(case (compileifthenelse (~ QuickSortWithCompareByReference)
(compare a:j a:r)
(compare (. a:j) (. a:r))
)compileifthenelse
-1
(exitblock exit_if_less_than_found)
+1
(consume ~t)
else (;; (-= q 1)
(= exch a:j)
(= a:j a:q)
(= a:q exch)
);;
)case
)compileifthenelse
);;
)loop
`move found elements to proper partitions':
(;; (= exch a:i)
(= a:i a:j)
(= a:j exch)
);;
`increment index':
(+= i 1)
);;
)loop
`3-way partitioning < = >':
(;;
`move pivot to final location':
(;; (= exch a:i)
(= a:i a:r)
(= a:r exch)
(= j (-- i))
(= i (++ i))
);;
`move elements equal to pivot to final locations':
(;; (do [k integer] l (-- p) +1
(;; (= exch a:k)
(= a:k a:j)
(= a:j exch)
(-= j 1)
);;
)do
(do [k integer] (-- r) q -1
(;; (= exch a:i)
(= a:i a:k)
(= a:k exch)
(+= i 1)
);;
)do
);;
);;
`sort partitions not equal to pivot':
(ifthenelse (<= (- r l) (coerce integer QuickSortParallelThreshold))
(;; (quicksort l j)
(quicksort i r)
);;
(|| (quicksort l j)
(quicksort i r)
)||
)ifthenelse
);;
)local
)ifthenelse
)action
)define
);;
(;; (quicksort (coerce integer from) (coerce integer to))
(ifdebug (do [i integer] (coerce integer from) (-- (coerce integer to)) +1
(trust (compileifthenelse QuickSortWithParlanseBuiltInOrderingOfNu
(<= a:i a:(++ i))
(compileifthenelse (~ QuickSortWithCompareByReference)
(<= (compare a:i a:(++ i)) +0)
(<= (compare (. a:i) (. a:(++ i))) +0)
)compileifthenelse
)compileifthenelse
`QuickSort:Sort -> The array is not sorted.'
)trust
)do
)ifdebug
);;
)local
)action
)define
(define Sort
(action (procedure (structure (compileifthen (~ QuickSortWithParlanseBuiltInOrderingOfNu)
(compileifthenelse (~ QuickSortWithCompareByReference)
[compare (function (sort integer (range -1 +1)) (structure [value1 Value] [value2 Value]))]
[compare (function (sort integer (range -1 +1)) (structure [value1 (reference Value)] [value2 (reference Value)]))]
)compileifthenelse
)compileifthen
[a (reference (array Value 1 dynamic))]
)structure
)procedure
(compileifthenelse (~ QuickSortWithParlanseBuiltInOrderingOfNu)
(SortRange compare a (coerce natural (lowerbound (@ a) 1)) (coerce natural (upperbound (@ a) 1)))
(SortRange a (coerce natural (lowerbound (@ a) 1)) (coerce natural (upperbound (@ a) 1)))
)compileifthenelse
)action
)define
);;
)module
)define