Sorting in Python

February 13, 2021

Continuing MIT’s Data Structures and Algorithms course today, with my Python code for insertion sort, merge sort, and heap sort.

In all three, I generated a list of n random integers between 0 and n, sorted the list, and then printed the first and last elements to make sure they were expected values, since printing the whole list would take forever with large values of n.

Merge sort came out on top, with heap taking ~50% longer, and insertion taking almost as long to sort 10,000 items as merge took to sort 1,000,000.

Insertion Sort

def main():
    # Create list of random integers
    n = 10000
    numList = [None] * n
    for i in range(n):
        numList[i] = random.randint(0, n)

    # Starting at the first list element ensures that element [j] is always being inserted into
    # a sorted list, [0: j-1].
    for j in range(1, n):
        value_being_sorted = numList[j]
        i = j - 1

    # Move every element greater than 'value_being_sorted' to the right. When we reach a number less
    # than 'value', we stop and place 'value' to the right of it. If we never find a number less than
    # 'value', then 'value' is our new first element.
        while numList[i] > value_being_sorted:
            numList[i+1] = numList[i]
            i = i - 1

        numList[i+1] = value_being_sorted

Merge Sort

def main():
    # Create list of random integers
    n = 1000000
    numList = [None] * n
    for i in range(n):
        numList[i] = random.randint(0, n)

    # mergeSort() and merge() are both passed lengths so that they don't need to find
    # it themselves with len(). len() is not a demanding function, but it would get
    # called a lot, resulting in large function call overhead.
    sortedList = mergeSort(numList, n)
    print(sortedList[0])
    print(sortedList[n-1])

def mergeSort(numList, n):
    # Split input list
    lenA = n//2
    lenB = n - n//2
    subListA = numList[0:n//2]
    subListB = numList[n//2:n]

    # Recurse if sublists can be split further
    if lenA > 1:
        subListA = mergeSort(subListA, lenA)
    if lenB > 1:
        subListB = mergeSort(subListB, lenB)

    # If sublists are sorted (and all single element lists are considered sorted), they are then merged
    output = merge(subListA, subListB, lenA, lenB)

    return output

def merge(subListA, subListB, lenA, lenB):
    # Three indices - one for each sublist, and one for the merged list
    i = 0
    j = 0
    k = 0

    # Initializing the merged output list is faster than creating an empty list and appending to it
    output = [None] * (lenA + lenB)

    # Compare values in each list, returning the larger, until one list is finished
    while i<lenA and j<lenB:
        if subListA[i] < subListB[j]:
            output[k] = subListA[i]
            i += 1
            k += 1
        else:
            output[k] = subListB[j]
            j += 1
            k += 1

    # Once one list is finished, the rest of the other list can be output without any value comparisons
    if i<lenA:
        output[k:] = subListA[i:]
    if j<lenB:
        output[k:] = subListB[j:]

    return output

Heap Sort

# Create list of random integers
def main():
    n = 1000000
    numList = [None] * n
    for i in range(n):
        numList[i] = random.randint(0, n)
    heapSort(numList, n)
    print(numList[0])
    print(numList[n-1])

def heapSort(numList, n):
    # heapSort() takes advantage of the O(1) time it takes to find the max in a max heap, 
    # so the numbers must first be turned into a max heap using heapify().
    heapify(numList, n)

    # The variable 'end' points to the last unsorted element in the list.
    # In each 'while' loop, the max (numList[0]) is swapped with the 'end' element,
    # which is then sifted to re-heapify the unsorted elements.
    end = n - 1
    while end > 0:
        numList[0], numList[end] = numList[end], numList[0]
        end = end - 1
        siftDown(numList, 0, end)

def heapify(numList, n):
    # heapify() starts at the middle of the list, which will always be the last parent 
    # element. Every element beyond this is a leaf, which cannot be sifted.
    start = n//2 - 1
    while start >= 0:
        siftDown(numList, start, n - 1)
        start = start - 1

def siftDown(numList, start, end):
    # The variable 'root' is the index of the parent of the current node group, and 
    # 'child' is the left child's index. Note that 2 * root + 1 returns the left child's
    # index in a zero-based array.
    root = start
    child = 2 * root + 1
    # If a child exists, 'swap' is used to index to the largest of the three nodes.
    # If that turns out to be the parent, then siftDown() is done. Otherwise, the 
    # parent is swapped with the larger child and the process repeats at the former parent's 
    # new location.
    while (child) <= end:
        swap = root
        if numList[swap] < numList[child]:
            swap = child
        if (child + 1) <= end and numList[swap] < numList[child + 1]:
            swap = child + 1

        if swap == root:
            return
        else:
            numList[root], numList[swap] = numList[swap], numList[root]
            root = swap
            child = 2 * root + 1

Lessons Learned

  • It can be worth it to avoid using the len() function if it’s going to be called a lot.
  • Initializing an array of [None] * x values and assigning a value for each index is faster than calling append() a bunch of times.
  • I need to learn to find more elegant ‘if’ statement solutions - my original attempt at the merge() function had some embarrassingly ugly branches.