@@@ Quicksort of an array of signed integers, in ARM unified assembly.
@@@ Shooting for minimum source code size (24 lines of code) and
@@@ minimum instruction count (19 on Thumb2, 17 in regular ARM) rather
@@@ than maximum efficiency or minimum executable size.  Seems to
@@@ work.  However, it will take quadratic time on a giant array of
@@@ all zeroes.
        .syntax unified
        .thumb
        .cpu cortex-m4
        @.cpu cortex-a53

@@@ Array of 4-byte signed integers is in memory at [r0, r1] (both
@@@ bounds inclusive).  Standard EABI calling convention: r0-r3 are
@@@ arguments or temps, r4-r8 are callee-saved.
        .thumb_func
        .globl quirks
quirks: push {r4, r5, r6, lr}
        mov r5, r1  @ Move upper bound to r5 for callee-saving. Frees up r1.
        @@ r2 is a normally-negative number used as a loop counter in
        @@ the partition loop: the offset from r5, the inclusive upper
        @@ bound, of the currently examined element.  We initialize it
        @@ here to point to the first element.  If that’s nonnegative,
        @@ we have 0 items or 1 item, so we can bail out.  This also
        @@ serves as a tail-call entry point.
    4:  subs r2, r0, r5         @ Calculate length of array, minus 4.
        it cs                   @ (only needed on Thumb)
        popcs {r4, r5, r6, pc}  @ Return if carry set (no borrow).

        @@ Inline partition function, partitioning [r4] to [r5]
        @@ inclusive using Lomuto’s simpler partitioning algorithm.
        @@ We’re going to use [r5 + r2] to count from r4 (start,
        @@ inclusive) to r5 (end, inclusive).  The idea of the weird
        @@ r2 thing is that we can save an instruction by using a
        @@ negative offset below r5 so the loop termination test is
        @@ simpler.

        @@ Note that we preserve r0 through the partitioning loop to
        @@ pass to the first recursive call.

        @@ r6 is the partition position (the index at which to place
        @@ the next element that is ≤ the partition).  Like our loop
        @@ counter, it starts at the start of the array.
        mov r6, r0
        @@ The pivot element is the last element, at [r5 - 4].  On the
        @@ last iteration, it too gets swapped into the low partition,
        @@ because it is less than or equal to itself, and r6 is left
        @@ pointing one element after it.  We save it in r3, which
        @@ saves cycles and costs Thumb bytes, but doesn’t change the
        @@ source code size.
        ldr r3, [r5]
        @@ We can safely check our loop counter against the bound at
        @@ the bottom of the loop because we know we have at least
        @@ one element, so the loop must run at least once.
    1:  ldr r4, [r5, r2]
        cmp r4, r3         @ Is element at r2 ≤ pivot?
        ittt le            @ If so, swap elements at r2 and r6:
        ldrle r1, [r6]     @ Load previously first element in > side,
        strle r4, [r6], #4 @ and postincrement r6,
        strle r1, [r5, r2] @ and save previous element at end of > side.
        adds r2, #4        @ Increment offset.  Reached the end (>0)?
        ble 1b   @ Loop if ≤ 0 (either N && !V && !C, or !N && V && C)

        @@ Now r6 points one word after the pivot element, so we must
        @@ recursively sort [r0, r6-8] and [r6, r5].  Neither range
        @@ includes the pivot element at [r6 - 4], guaranteeing
        @@ termination.  Implicitly pass r0 as the same lower bound.
        subs r1, r6, #8
        bl quirks               @ Recursively sort left partition.

        mov r0, r6
        @@ Implicitly pass r5 as the same upper bound.
        b 4b                  @ Tail-recursively sort right partition.