Optimizing a Python program in Go and Odin

2026-03-08

Here we write optimized versions of a Python reference program in both Go and Odin, using mostly similar optimization techniques.

Define the problem

The idea here is to write a highly optimized program in Go and Odin (using similar optimization techniques to the extent possible) that counts the number of papers for each arXiv paper author (given the CSV dataset paperscape-data, which covers papers up to 2017), sorts the authors by number of papers and then by name, and displays the number of papers and the name of each author. The implementations are direct (i.e., they do not use a CSV library):

Python reference implementation

import glob
import sys
from collections import defaultdict

# Paper count per author name; authors not seen before start at zero.
authors = defaultdict(int)

for fn in sorted(glob.glob("paperscape-data/*.csv")):
    # Echo the file name to stderr before reading it.
    sys.stderr.write('%s\n' % fn)
    with open(fn) as f:
        for line in f:
            # Lines starting with '#' are skipped.
            if line.startswith('#'):
                continue
            # The sixth ';'-separated field holds the comma-separated authors.
            au = line.split(';', 6)[5]
            if not au:
                continue
            for a in au.split(','):
                authors[a] += 1

# Highest paper count first; equal counts are ordered by author name.
for a, v in sorted(authors.items(), key=lambda kv: (-kv[1], kv[0])):
    print(v, a)

Go and Odin optimized implementations

Here we discuss the optimizations used in the Go and Odin implementations, as well as some similarities and differences between the two languages.

Imports

Odin and Go both use a directory as the package concept, and in both languages a program starts with the declaration of the package name, although in Odin the main package of a program does not have to be named main as it does in Go. The structure of the standard library in Odin is also to some extent similar to the one in Go, so most of the packages imported in both programs have the same names, although Odin has the additional concept of package collections (base, core, and vendor). The collection is specified as a prefix of the imported package (before the colon).

Go

package main

import (
	"bufio"
	"bytes"
	"cmp"
	"fmt"
	"log"
	"os"
	"slices"
	"strings"
)

Odin

package main

import "core:bufio"
import "core:bytes"
import "core:fmt"
import "core:io"
import "core:os"
import "core:slice"
import "core:sort"
import "core:strings"

Process input files

As opposed to the Python reference implementation, both the Go and Odin programs take the CSV files as program arguments. The function naming convention in Go is camel case, while in Odin it is snake case. While Go is a garbage-collected language, in Odin memory is managed manually (with easy use of multiple allocators, which we will use below). The defer statement in Odin can be used to free the allocated memory at the end of a scope (note that Odin's defer has block scope while Go's defer has function scope). This loop has only a few iterations and thus the code does not need any optimization.

Go

	// m maps an author name to the number of papers counted for it.
	m := make(map[string]int)
	for _, path := range os.Args[1:] {
		// Echo the file name to stderr before processing it.
		fmt.Fprintln(os.Stderr, path)
		if err := processFile(m, path); err != nil {
			log.Fatal(err)
		}
	}

Odin

	// m maps an author name to the number of papers counted for it.
	m := make(map[string]int)
	defer delete(m)
	for path in os.args[1:] {
		// Echo the file name to stderr before processing it.
		fmt.eprintln(path)
		if err := process_file(&m, path); err != nil {
			fmt.eprintln("error in file:", path, err)
			os.exit(1)
		}
	}

Process files line by line

Both Go and Odin have a bufio package with a Scanner type, which we use with a buffer of 128 kB, large enough to hold the longest line (this avoids reallocation, at least in Go). We process each line, adding its authors to the map, and finally return the scanner error (if any). In Odin errors are concrete types, and to be able to return an error that comes from either the os or the bufio package we use a union with shared nil (i.e., the nil value of any member type of the union becomes the nil value of the union; this fixes a problem similar to Go's interface nil versus inner-type nil).

Go

// processFile reads the named file line by line and passes every line to
// processLine, which updates the counts in m. It returns the error from
// opening the file or, once scanning has stopped, the scanner's error.
func processFile(m map[string]int, filename string) error {
	file, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	// Hand the scanner a fixed 128 KiB buffer; its length is also the
	// maximum line size the scanner will accept.
	var lineBuf [128 * 1024]byte
	scanner := bufio.NewScanner(file)
	scanner.Buffer(lineBuf[:], len(lineBuf))

	for scanner.Scan() {
		processLine(m, scanner.Bytes())
	}
	return scanner.Err()
}

Odin

// Error is what process_file returns: either an os error or a bufio scanner
// error. With #shared_nil the nil value of either member is the nil value of
// the union.
Error :: union #shared_nil {
	os.Error,
	bufio.Scanner_Error,
}

// process_file scans the file at filepath line by line and passes every line
// to process_line, which updates the counts in m.
// Returns the error from opening the file (via or_return) or, once scanning
// has stopped, the scanner's error.
process_file :: proc(m: ^map[string]int, filepath: string) -> Error {
	handle := os.open(filepath) or_return
	defer os.close(handle)

	// Fixed 128 KiB buffer handed to the scanner up front.
	storage: [128 * 1024]byte
	scanner: bufio.Scanner
	bufio.scanner_init_with_buffer(&scanner, os.stream_from_handle(handle), storage[:])
	defer bufio.scanner_destroy(&scanner)

	for bufio.scanner_scan(&scanner) {
		process_line(m, bufio.scanner_bytes(&scanner))
	}
	return bufio.scanner_error(&scanner)
}

Process line

In both Go and Odin we use split functions that return iterators instead of the alternatives that return slices (an optimization). In both Go and Odin it is better for the outer split to operate on bytes and the inner one on strings: in Go this is connected with avoiding allocation (a bytes-to-string conversion in Go almost always means an allocation, which may be avoided when the compiler can prove that the conversion is temporary and the result is not stored for later use); in Odin it is unclear why this is the case (as there is no allocation connected with a bytes-to-string conversion).

In Odin we allocate only the strings that are actually inserted as new keys into the map (as opposed to incrementing the value for an already existing key: such keys are not allocated). This can be fast thanks to obtaining a reference to the map value, which we can increment; when !ok, we clone the new key using the temporary allocator (which is faster than the default one but only allows freeing all allocations at once, not individual ones, so its speed comes from less bookkeeping and from being thread local, i.e., no mutex lock). The temporary allocator is used only in the Odin implementation, and it makes the Odin version faster than the Go version.

Go

var (
	comment   = []byte{'#'}
	semicolon = []byte{';'}
)

func processLine(m map[string]int, line []byte) {
	if bytes.HasPrefix(line, comment) {
		return
	}
	i := 0
	for b := range bytes.SplitSeq(line, semicolon) {
		if i == 5 {
			for a := range strings.SplitSeq(string(b), ",") {
				if len(a) > 0 {
					m[a] += 1
				}
			}
			break
		}
		i++
	}
}

Odin

// semicolon is the separator process_line passes to bytes.split_iterator.
semicolon :: []byte{';'}

// process_line adds one to the count in m for every non-empty,
// comma-separated name in the sixth semicolon-separated field of line.
// Empty lines, lines starting with '#', and lines with fewer than six fields
// leave m untouched.
process_line :: proc(m: ^map[string]int, line: []u8) {
	if len(line) == 0 || line[0] == '#' {
		return
	}
	// The iterator advances the slice it is given, so work on a local copy of
	// the slice header.
	rest := line
	field := 0
	for token in bytes.split_iterator(&rest, semicolon) {
		if field != 5 {
			field += 1
			continue
		}
		names := string(token)
		for name in strings.split_by_byte_iterator(&names, ',') {
			if len(name) == 0 {
				continue
			}
			// Existing key: bump the count in place. New key: clone the name
			// with the temporary allocator and insert it.
			if value, ok := &m[name]; ok {
				value^ += 1
			} else {
				m[strings.clone(name, allocator=context.temp_allocator)] = 1
			}
		}
		// Nothing after the author field is needed.
		break
	}
}

Sort results

After counting papers in the map we collect the entries into an array and sort them first by the number of papers (descending) and then by the author name (ascending). In both Go and Odin we use the size of the map to allocate an array/slice of the proper size. In Odin we use sort.quick_sort_proc to make the sorting faster (in this context it is faster than slice.sort_by_cmp).

Go

// Author pairs an author name with the number of papers counted for it.
type Author struct {
	count int
	name  string
}
	// The map size is the exact number of authors, so the slice is allocated
	// once with the right capacity.
	authors := make([]Author, 0, len(m))
	for name, count := range m {
		// Keyed fields keep the map key in name and the map value in count.
		authors = append(authors, Author{count: count, name: name})
	}
	slices.SortFunc(authors, compare)
// compare orders authors by paper count, highest first, and breaks ties by
// name in ascending order. It returns a negative number when a sorts before
// b, a positive number when a sorts after b, and zero when they are equal.
func compare(a Author, b Author) int {
	switch {
	case a.count > b.count:
		return -1
	case a.count < b.count:
		return 1
	}
	return cmp.Compare(a.name, b.name)
}

Odin

// Author pairs an author name with the number of papers counted for it.
Author :: struct {
	count: int,
	name: string,
}
	// The map size is the exact number of authors, so the array is allocated
	// once with the right capacity.
	authors := make([dynamic]Author, 0, len(m))
	defer delete(authors)
	for name, count in m {
		append(&authors, Author{count = count, name = name})
	}
	sort.quick_sort_proc(authors[:], compare)
// compare orders authors by paper count, highest first, and breaks ties with
// sort.compare_strings on the names.
compare :: proc(a, b: Author) -> int {
	if a.count < b.count {
		return 1
	}
	if a.count > b.count {
		return -1
	}
	return sort.compare_strings(a.name, b.name)
}

Output results

Both Go and Odin have a buffered Writer in the bufio package; we use it to optimize writing the final output, which is huge (7 MB), i.e., too big to send line by line to the operating system. In Odin there is an important trick: set flush to false, as otherwise fmt.wprintln flushes the writer after writing each line, which makes the buffer useless.

Go

	w := bufio.NewWriter(os.Stdout)
	for _, a := range authors {
		if _, err := fmt.Fprintf(w, "%d %s\n", a.count, a.name); err != nil {
			log.Fatal(err)
		}
	}
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}

Odin

	// Buffered writer over stdout, backed by a fixed 4096-byte array.
	storage: [4096]u8
	buffered: bufio.Writer
	bufio.writer_init_with_buf(&buffered, os.stream_from_handle(os.stdout), storage[:])
	defer bufio.writer_destroy(&buffered)
	stream := bufio.writer_to_stream(&buffered)
	for entry in authors {
		// flush=false keeps wprintln from flushing after every line.
		fmt.wprintln(stream, entry.count, entry.name, flush=false)
	}
	bufio.writer_flush(&buffered)

Program execution comparison

The comparison below is based on one hundred executions on a machine running Linux with AMD Ryzen 7 5700X 8-Core Processor CPU.

The Python interpreter used was in version 3.14.3, the Go compiler used was in version 1.26.0, and the Odin compiler version was dev-2026-02.

The difference between the Python reference implementation and both the Go and Odin implementations is huge in most of the comparisons below, so the comments will usually compare Go to Odin.

Elapsed (wall clock) time (seconds)

In this experiment our Odin program is 14% faster than our Go program (wall clock). The Go program is extremely fast if you take into account that Go is a garbage-collected language. The speed difference obtained in this experiment is due to using the temporary allocator for author names in Odin; before this optimization, with all the other optimizations in place, their execution times were comparable.

Name Average Stddev Min Max
Python 2.748 0.021 2.720 2.850
Go 0.580 0.006 0.570 0.600
Odin 0.499 0.003 0.490 0.510
Elapsed (wall clock) time plot

User time (seconds)

Although the Odin program is only 14% faster than the Go program in wall-clock time, the Go program uses, to some extent, another core to run garbage collection, and thus the user time of the Odin program is 27% smaller than that of the Go program.

Name Average Stddev Min Max
Python 2.650 0.022 2.610 2.760
Go 0.641 0.014 0.620 0.680
Odin 0.468 0.006 0.460 0.480
User time plot

System time (seconds)

The system time used by the Odin program is (on average) 11% smaller than that of the Go program in this experiment, but given the standard deviation and the identical min and max for both languages, it is hard to say more than that the Odin program uses slightly less system time on average.

Name Average Stddev Min Max
Python 0.079 0.009 0.060 0.100
Go 0.027 0.007 0.010 0.040
Odin 0.024 0.006 0.010 0.040
System time plot

Percent of CPU this job got

Both Python and Odin make practically full use (99%) of a single core; only Go uses more than a single core (115%), and the extra must be spent performing some of the garbage collection work on another thread.

Name Average Stddev Min Max
Python 99.000 0.000 99 99
Go 115.440 1.538 113 119
Odin 99.000 0.000 99 99
Percent of CPU plot

Maximum resident set size (kilobytes)

The Python reference program used 63% more memory than the Go program, which in turn used 64% more memory than the Odin program (equivalently, each step down uses about 39% less memory). When populating the map with authors’ paper counts, the Odin program allocates only the keys that are needed, while Go allocates a key for each author in the file and the ones not needed are then garbage collected; Go also has a huge runtime (garbage collector and support for green threads, i.e., goroutines) compared to the minimal one in the Odin program.

Name Average Stddev Min Max
Python 147187 132.272 146816 147488
Go 90320 1576.638 86024 92764
Odin 54954 132.009 54588 55264
Maximum resident set size plot

Full source code for optimized implementations

Here are Go and Odin implementations of the reference Python program.

Go implementation

package main

import (
	"bufio"
	"bytes"
	"cmp"
	"fmt"
	"log"
	"os"
	"slices"
	"strings"
)

// Author pairs an author name with the number of papers counted for it.
type Author struct {
	count int
	name  string
}

// main counts papers per author across the files named on the command line
// and prints one "count name" line per author to stdout, ordered by compare.
func main() {
	// m maps an author name to the number of papers counted for it.
	m := make(map[string]int)
	for _, path := range os.Args[1:] {
		// Echo the file name to stderr before processing it.
		fmt.Fprintln(os.Stderr, path)
		if err := processFile(m, path); err != nil {
			log.Fatal(err)
		}
	}

	// The map size is the exact number of authors, so the slice is allocated
	// once with the right capacity.
	authors := make([]Author, 0, len(m))
	for name, count := range m {
		authors = append(authors, Author{count: count, name: name})
	}
	slices.SortFunc(authors, compare)

	// Buffer stdout so each line does not become its own write.
	out := bufio.NewWriter(os.Stdout)
	for _, au := range authors {
		if _, err := fmt.Fprintf(out, "%d %s\n", au.count, au.name); err != nil {
			log.Fatal(err)
		}
	}
	// Push whatever is still buffered to stdout.
	if err := out.Flush(); err != nil {
		log.Fatal(err)
	}
}

// processFile reads the named file line by line and passes every line to
// processLine, which updates the counts in m. It returns the error from
// opening the file or, once scanning has stopped, the scanner's error.
func processFile(m map[string]int, filename string) error {
	file, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	// Hand the scanner a fixed 128 KiB buffer; its length is also the
	// maximum line size the scanner will accept.
	var lineBuf [128 * 1024]byte
	scanner := bufio.NewScanner(file)
	scanner.Buffer(lineBuf[:], len(lineBuf))

	for scanner.Scan() {
		processLine(m, scanner.Bytes())
	}
	return scanner.Err()
}

var (
	comment   = []byte{'#'}
	semicolon = []byte{';'}
)

func processLine(m map[string]int, line []byte) {
	if bytes.HasPrefix(line, comment) {
		return
	}
	i := 0
	for b := range bytes.SplitSeq(line, semicolon) {
		if i == 5 {
			for a := range strings.SplitSeq(string(b), ",") {
				if len(a) > 0 {
					m[a] += 1
				}
			}
			break
		}
		i++
	}
}

// compare orders authors by paper count, highest first, and breaks ties by
// name in ascending order. It returns a negative number when a sorts before
// b, a positive number when a sorts after b, and zero when they are equal.
func compare(a Author, b Author) int {
	switch {
	case a.count > b.count:
		return -1
	case a.count < b.count:
		return 1
	}
	return cmp.Compare(a.name, b.name)
}

Odin implementation

package main

import "core:bufio"
import "core:bytes"
import "core:fmt"
import "core:io"
import "core:os"
import "core:slice"
import "core:sort"
import "core:strings"

// Author pairs an author name with the number of papers counted for it.
Author :: struct {
	count: int,
	name: string,
}

// main counts papers per author across the files named on the command line
// and prints one "count name" line per author to stdout, ordered by compare.
main :: proc() {
	// m maps an author name to the number of papers counted for it.
	m := make(map[string]int)
	defer delete(m)
	for path in os.args[1:] {
		// Echo the file name to stderr before processing it.
		fmt.eprintln(path)
		if err := process_file(&m, path); err != nil {
			fmt.eprintln("error in file:", path, err)
			os.exit(1)
		}
	}

	// The map size is the exact number of authors, so the array is allocated
	// once with the right capacity.
	authors := make([dynamic]Author, 0, len(m))
	defer delete(authors)
	for name, count in m {
		append(&authors, Author{count = count, name = name})
	}
	sort.quick_sort_proc(authors[:], compare)

	// Buffered writer over stdout, backed by a fixed 4096-byte array.
	storage: [4096]u8
	buffered: bufio.Writer
	bufio.writer_init_with_buf(&buffered, os.stream_from_handle(os.stdout), storage[:])
	defer bufio.writer_destroy(&buffered)
	stream := bufio.writer_to_stream(&buffered)
	for entry in authors {
		// flush=false keeps wprintln from flushing after every line.
		fmt.wprintln(stream, entry.count, entry.name, flush=false)
	}
	bufio.writer_flush(&buffered)

	// Release everything allocated with the temporary allocator (the map keys
	// cloned in process_line) in one call.
	free_all(context.temp_allocator)
}

// Error is what process_file returns: either an os error or a bufio scanner
// error. With #shared_nil the nil value of either member is the nil value of
// the union.
Error :: union #shared_nil {
	os.Error,
	bufio.Scanner_Error,
}

// process_file scans the file at filepath line by line and passes every line
// to process_line, which updates the counts in m.
// Returns the error from opening the file (via or_return) or, once scanning
// has stopped, the scanner's error.
process_file :: proc(m: ^map[string]int, filepath: string) -> Error {
	handle := os.open(filepath) or_return
	defer os.close(handle)

	// Fixed 128 KiB buffer handed to the scanner up front.
	storage: [128 * 1024]byte
	scanner: bufio.Scanner
	bufio.scanner_init_with_buffer(&scanner, os.stream_from_handle(handle), storage[:])
	defer bufio.scanner_destroy(&scanner)

	for bufio.scanner_scan(&scanner) {
		process_line(m, bufio.scanner_bytes(&scanner))
	}
	return bufio.scanner_error(&scanner)
}

// semicolon is the separator process_line passes to bytes.split_iterator.
semicolon :: []byte{';'}

// process_line adds one to the count in m for every non-empty,
// comma-separated name in the sixth semicolon-separated field of line.
// Empty lines, lines starting with '#', and lines with fewer than six fields
// leave m untouched.
process_line :: proc(m: ^map[string]int, line: []u8) {
	if len(line) == 0 || line[0] == '#' {
		return
	}
	// The iterator advances the slice it is given, so work on a local copy of
	// the slice header.
	rest := line
	field := 0
	for token in bytes.split_iterator(&rest, semicolon) {
		if field != 5 {
			field += 1
			continue
		}
		names := string(token)
		for name in strings.split_by_byte_iterator(&names, ',') {
			if len(name) == 0 {
				continue
			}
			// Existing key: bump the count in place. New key: clone the name
			// with the temporary allocator and insert it.
			if value, ok := &m[name]; ok {
				value^ += 1
			} else {
				m[strings.clone(name, allocator=context.temp_allocator)] = 1
			}
		}
		// Nothing after the author field is needed.
		break
	}
}

// compare orders authors by paper count, highest first, and breaks ties with
// sort.compare_strings on the names.
compare :: proc(a, b: Author) -> int {
	if a.count < b.count {
		return 1
	}
	if a.count > b.count {
		return -1
	}
	return sort.compare_strings(a.name, b.name)
}