Optimizing Python program in Go and Odin
Table of Contents
Here we write optimized versions of a Python reference program in both Go and Odin, using mostly similar optimization techniques.
Define the problem
The idea here is to write a highly optimized program in Go and in Odin (using, to the extent possible, similar optimization techniques) that counts the number of papers for each arXiv paper author (given the CSV dataset paperscape-data, up to 2017), sorts the authors by number of papers (descending) and then by name, and displays the number of papers and the name of each author. The implementations are direct (i.e., they do not use a CSV library):
- below, first, is Python reference implementation,
- we compare optimized implementations in Go and Odin,
- and then compare execution time and memory (for all three: Python, Go, and Odin),
- finally we provide full Go and Odin optimized implementations.
Python reference implementation
import glob
import sys
from collections import defaultdict

# Number of papers per author name, accumulated over every input file.
authors = defaultdict(int)

for fn in sorted(glob.glob("paperscape-data/*.csv")):
    # Progress goes to stderr so that stdout carries only the final report.
    sys.stderr.write('%s\n' % fn)
    with open(fn) as f:
        for line in f:
            # Lines starting with '#' are skipped.
            if not line.startswith('#'):
                # The sixth ';'-separated field is the comma-separated author list.
                au = line.split(';', 6)[5]
                if len(au) != 0:
                    for a in au.split(','):
                        authors[a] += 1

# Most papers first; ties are broken by author name in ascending order.
for a, v in sorted(authors.items(), key=lambda kv: (-kv[1], kv[0])):
    print(v, a)
Go and Odin optimized implementations
Here we discuss optimizations used in Go and Odin implementations but also some similarities and differences between the two languages.
Imports
Odin and Go both use a directory as the package concept. In both languages
programs start with the declaration of the package name, although in Odin the main package of
a program does not have to be named main as it does in Go. Also, the structure of
the standard library in Odin is to some extent similar to the one in Go, so most of
the packages imported in both programs have the same names, although Odin has
the additional concept of package collections
(base,
core, and
vendor). The collection is specified as
a prefix of the imported package (before the colon).
Go
package main
import (
"bufio"
"bytes"
"cmp"
"fmt"
"log"
"os"
"slices"
"strings"
)
Odin
package main
import "core:bufio"
import "core:bytes"
import "core:fmt"
import "core:io"
import "core:os"
import "core:slice"
import "core:sort"
import "core:strings"
Process input files
As opposed to the Python reference implementation, both the Go and Odin programs
take the CSV files as program arguments. The function naming convention in Go is
camel case, while in Odin it is snake case. While Go is a garbage-collected
language, in Odin memory is managed manually (with easy use of multiple
allocators, which we will use below). The defer statement in Odin can be used to
free the allocated memory at the end of a scope (note that Odin's defer has block scope
while Go's defer has function scope). This loop has only a few iterations and
thus the code does not need any optimizations.
Go
// m maps an author name to the number of papers counted so far.
m := make(map[string]int)
for _, filename := range os.Args[1:] {
	// Echo each input file to stderr as progress output.
	fmt.Fprintln(os.Stderr, filename)
	err := processFile(m, filename)
	if err != nil {
		log.Fatal(err)
	}
}
Odin
// m maps an author name to the number of papers counted so far.
m := make(map[string]int)
defer delete(m)
for path in os.args[1:] {
	// Echo each input file to stderr as progress output.
	fmt.eprintln(path)
	err := process_file(&m, path)
	if err != nil {
		fmt.eprintln("error in file:", path, err)
		os.exit(1)
	}
}
Process files line by line
Both Go and Odin have a bufio package with a Scanner type,
which we use with a buffer of 128kB, large enough to hold the longest line
(to avoid reallocation, at least in Go). We process each line, adding
authors to the map, and return the scanner error (if any). In Odin errors are
concrete types, and to be able to return an error which comes either from the
os or the
bufio package we use a union with shared nil (i.e., the nil value of any of the
member types of the union becomes the nil value of the union: this fixes a problem
similar to Go's interface nil versus inner-type nil).
Go
// processFile reads filename line by line and passes every line to
// processLine, which accumulates the counts in m. It returns the error from
// opening the file or, failing that, the error reported by the scanner.
func processFile(m map[string]int, filename string) error {
	file, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	// One fixed 128 KiB buffer that also serves as the maximum token size.
	var lineBuf [128 * 1024]byte
	scanner := bufio.NewScanner(file)
	scanner.Buffer(lineBuf[:], len(lineBuf))

	for scanner.Scan() {
		processLine(m, scanner.Bytes())
	}
	return scanner.Err()
}
Odin
// Error is returned by process_file: it carries either an os error or a
// bufio scanner error, and the nil of either member is the nil of the union.
Error :: union #shared_nil {
	os.Error,
	bufio.Scanner_Error,
}

// process_file reads the file at filepath line by line and passes every line
// to process_line, which accumulates the counts in m. It returns the error
// from opening the file or, failing that, the error reported by the scanner.
process_file :: proc(m: ^map[string]int, filepath: string) -> Error {
	handle := os.open(filepath) or_return
	defer os.close(handle)

	// One fixed 128 KiB buffer handed to the scanner.
	line_buffer: [128 * 1024]byte
	scanner: bufio.Scanner
	bufio.scanner_init_with_buffer(&scanner, os.stream_from_handle(handle), line_buffer[:])
	defer bufio.scanner_destroy(&scanner)

	for bufio.scanner_scan(&scanner) {
		process_line(m, bufio.scanner_bytes(&scanner))
	}
	return bufio.scanner_error(&scanner)
}
Process line
In both Go and Odin we use split functions that return iterators instead of the alternatives that return slices (an optimization). In both Go and Odin it is better for the outer split to operate on bytes and the inner one on strings: in Go this is connected with avoiding allocation (a bytes-to-string conversion in Go almost always means an allocation, which may be avoided only when the compiler can prove it is a temporary conversion that is not stored for later use); in Odin it is unclear why this is the case (as there is no allocation connected with a bytes-to-string conversion).
In Odin we allocate only the strings that are actually inserted as new keys into the
map (as opposed to incrementing the value for an already existing key: such keys
are not allocated). This can be fast thanks to obtaining a reference to the map value,
which we can increment; when !ok, we clone the new key using the temporary
allocator (which is faster than the default one but only allows freeing all
allocations at once, not individual ones, so its speed comes from less bookkeeping
and from being thread local, i.e., no mutex lock). The temporary allocator is
used only in the Odin implementation and makes the Odin version faster than the Go
version.
Go
var (
comment = []byte{'#'}
semicolon = []byte{';'}
)
func processLine(m map[string]int, line []byte) {
if bytes.HasPrefix(line, comment) {
return
}
i := 0
for b := range bytes.SplitSeq(line, semicolon) {
if i == 5 {
for a := range strings.SplitSeq(string(b), ",") {
if len(a) > 0 {
m[a] += 1
}
}
break
}
i++
}
}
Odin
// Field separator as a byte slice, as expected by bytes.split_iterator.
semicolon :: []byte{';'}

// process_line adds one paper to m for every non-empty author name found in
// the sixth ';'-separated field of line. Empty lines, lines starting with '#'
// and lines with fewer than six fields leave m untouched.
process_line :: proc(m: ^map[string]int, line: []u8) {
	if len(line) == 0 || line[0] == '#' {
		return
	}

	AUTHORS_FIELD :: 5 // zero-based index of the author list
	rest := line // mutable copy of the slice header: the iterator advances it
	field := 0
	for token in bytes.split_iterator(&rest, semicolon) {
		if field < AUTHORS_FIELD {
			field += 1
			continue
		}
		names := string(token)
		for name in strings.split_by_byte_iterator(&names, ',') {
			if len(name) == 0 {
				continue
			}
			if count, found := &m[name]; found {
				count^ += 1
			} else {
				// Only a name that becomes a new key is cloned.
				m[strings.clone(name, allocator = context.temp_allocator)] = 1
			}
		}
		return
	}
}
Sort results
After counting papers in the map we collect them into an array and sort them first by the number of papers (descending) and then by the author name (ascending). In both Go and Odin we use the size of the map to allocate an array/slice of the proper size. In Odin we use sort.quick_sort_proc to make the sorting faster (in this context it is faster than slice.sort_by_cmp).
Go
// Author pairs an author's name with the number of papers counted for them.
type Author struct {
	count int    // number of papers
	name  string // author name, as used for the map key
}

// Collect the map entries into a slice pre-sized to the number of authors.
authors := make([]Author, 0, len(m))
for k, v := range m {
	// k is the name and v the count. Keyed fields keep the two from being
	// swapped: a positional Author{k, v} does not compile with this struct.
	authors = append(authors, Author{count: v, name: k})
}
slices.SortFunc(authors, compare)
// compare orders authors by paper count, highest first, and breaks ties by
// name in ascending order. It returns -1, 0 or 1 as slices.SortFunc expects.
func compare(a Author, b Author) int {
	switch {
	case a.count > b.count:
		return -1
	case a.count < b.count:
		return 1
	}
	return cmp.Compare(a.name, b.name)
}
Odin
// Author pairs an author's name with the number of papers counted for them.
Author :: struct {
	count: int,
	name:  string,
}

// Collect the map entries into an array pre-sized to the number of authors.
authors := make([dynamic]Author, 0, len(m))
defer delete(authors)
for name, count in m {
	append(&authors, Author{count = count, name = name})
}
sort.quick_sort_proc(authors[:], compare)
// compare orders authors by paper count, highest first, and breaks ties by
// name using sort.compare_strings.
compare :: proc(a, b: Author) -> int {
	if a.count < b.count {
		return 1
	}
	if a.count > b.count {
		return -1
	}
	return sort.compare_strings(a.name, b.name)
}
Output results
Both Go and Odin have a buffered Writer in the bufio package; we use it to
optimize displaying the final output, which is huge (7MB), i.e., too big to send
line by line to the operating system. In Odin there is an important trick: set
flush to false, as otherwise
fmt.wprintln
calls flush on the writer after writing each line,
so the buffer becomes useless without setting flush to false.
Go
// Buffer stdout so the report is not handed to the OS one line at a time.
w := bufio.NewWriter(os.Stdout)
for _, author := range authors {
	_, err := fmt.Fprintf(w, "%d %s\n", author.count, author.name)
	if err != nil {
		log.Fatal(err)
	}
}
// Whatever is still buffered must be written out explicitly.
if err := w.Flush(); err != nil {
	log.Fatal(err)
}
Odin
// Buffer stdout so the report is not handed to the OS one line at a time.
b: bufio.Writer
buf: [4096]u8
bufio.writer_init_with_buf(&b, os.stream_from_handle(os.stdout), buf[:])
defer bufio.writer_destroy(&b)
w := bufio.writer_to_stream(&b)
for author in authors {
	// flush=false keeps wprintln from flushing the writer after every line.
	fmt.wprintln(w, author.count, author.name, flush=false)
}
// Report a failed write instead of silently dropping the flush result.
if err := bufio.writer_flush(&b); err != .None {
	fmt.eprintln("error writing output:", err)
	os.exit(1)
}
Program execution comparison
The comparison below is based on one hundred executions on a machine running Linux with AMD Ryzen 7 5700X 8-Core Processor CPU.
The Python interpreter used was in version 3.14.3, the Go compiler used was in version 1.26.0, and the Odin compiler version was dev-2026-02.
The difference between the Python reference implementation and both the Go and Odin implementations is huge for most of the comparisons below, so the comments will usually compare Go to Odin.
Elapsed (wall clock) time (seconds)
In this experiment our Odin program is 14% faster than our Go program (wall clock). The Go program is extremely fast if you take into account that Go is a garbage-collected language. The speed difference obtained in this experiment is due to using the temporary allocator for authors' names in Odin; before this optimization, with all the other optimizations in place, the execution times were comparable.
| Name | Average | Stddev | Min | Max |
|---|---|---|---|---|
| Python | 2.748 | 0.021 | 2.720 | 2.850 |
| Go | 0.580 | 0.006 | 0.570 | 0.600 |
| Odin | 0.499 | 0.003 | 0.490 | 0.510 |
User time (seconds)
Although the Odin program is only 14% faster than the Go program in wall-clock time, the Go program uses, to some extent, another core to run garbage collection; thus the user time of the Odin program is 27% smaller than that of the Go program.
| Name | Average | Stddev | Min | Max |
|---|---|---|---|---|
| Python | 2.650 | 0.022 | 2.610 | 2.760 |
| Go | 0.641 | 0.014 | 0.620 | 0.680 |
| Odin | 0.468 | 0.006 | 0.460 | 0.480 |
System time (seconds)
The system time used by the Odin program is (on average) 11% smaller than that of the Go program in this experiment, but given the standard deviation and the identical min and max for both languages, it is hard to say more than that the Odin program uses slightly less system time on average.
| Name | Average | Stddev | Min | Max |
|---|---|---|---|---|
| Python | 0.079 | 0.009 | 0.060 | 0.100 |
| Go | 0.027 | 0.007 | 0.010 | 0.040 |
| Odin | 0.024 | 0.006 | 0.010 | 0.040 |
Percent of CPU this job got
The percentage of CPU used is practically full use (99%) of a single core for both Python and Odin; only Go uses more than a single core (115%), which must be going to garbage-collection work performed on another thread.
| Name | Average | Stddev | Min | Max |
|---|---|---|---|---|
| Python | 99.000 | 0.000 | 99 | 99 |
| Go | 115.440 | 1.538 | 113 | 119 |
| Odin | 99.000 | 0.000 | 99 | 99 |
Maximum resident set size (kilobytes)
The Go program uses 39% less memory than the Python reference program, and the Odin program uses 39% less memory than the Go program (put the other way round, Python uses 63% more memory than Go, and Go uses 64% more than Odin). When populating the map with authors' paper counts, the Odin program allocates only the keys that are needed, while Go allocates a key for each author in the file and the ones that are not needed are then garbage collected. Go also has a huge runtime (garbage collector and support for green threads, i.e., goroutines) compared to the minimal one in the Odin program.
| Name | Average | Stddev | Min | Max |
|---|---|---|---|---|
| Python | 147187 | 132.272 | 146816 | 147488 |
| Go | 90320 | 1576.638 | 86024 | 92764 |
| Odin | 54954 | 132.009 | 54588 | 55264 |
Full source code for optimized implementations
Here are Go and Odin implementations of the reference Python program.
Go implementation
package main
import (
"bufio"
"bytes"
"cmp"
"fmt"
"log"
"os"
"slices"
"strings"
)
// Author pairs an author's name with the number of papers counted for them.
// The field order matters: positional literals rely on it.
type Author struct {
	count int    // number of papers
	name  string // author name, as used for the map key
}
// main counts papers per author across the CSV files named on the command
// line and prints one "<count> <name>" line per author to stdout, ordered by
// compare. Each file name is echoed to stderr before it is processed, and any
// error terminates the program through log.Fatal.
func main() {
	m := make(map[string]int)
	for _, filename := range os.Args[1:] {
		fmt.Fprintln(os.Stderr, filename)
		err := processFile(m, filename)
		if err != nil {
			log.Fatal(err)
		}
	}

	// The map size is the exact number of authors, so the slice never regrows.
	authors := make([]Author, 0, len(m))
	for name, count := range m {
		authors = append(authors, Author{count: count, name: name})
	}
	slices.SortFunc(authors, compare)

	// Buffer stdout so the report is not handed to the OS one line at a time.
	w := bufio.NewWriter(os.Stdout)
	for _, author := range authors {
		_, err := fmt.Fprintf(w, "%d %s\n", author.count, author.name)
		if err != nil {
			log.Fatal(err)
		}
	}
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}
}
// processFile reads filename line by line and passes every line to
// processLine, which accumulates the counts in m. It returns the error from
// opening the file or, failing that, the error reported by the scanner.
func processFile(m map[string]int, filename string) error {
	file, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	// One fixed 128 KiB buffer that also serves as the maximum token size.
	var lineBuf [128 * 1024]byte
	scanner := bufio.NewScanner(file)
	scanner.Buffer(lineBuf[:], len(lineBuf))

	for scanner.Scan() {
		processLine(m, scanner.Bytes())
	}
	return scanner.Err()
}
var (
comment = []byte{'#'}
semicolon = []byte{';'}
)
func processLine(m map[string]int, line []byte) {
if bytes.HasPrefix(line, comment) {
return
}
i := 0
for b := range bytes.SplitSeq(line, semicolon) {
if i == 5 {
for a := range strings.SplitSeq(string(b), ",") {
if len(a) > 0 {
m[a] += 1
}
}
break
}
i++
}
}
// compare orders authors by paper count, highest first, and breaks ties by
// name in ascending order. It returns -1, 0 or 1 as slices.SortFunc expects.
func compare(a Author, b Author) int {
	switch {
	case a.count > b.count:
		return -1
	case a.count < b.count:
		return 1
	}
	return cmp.Compare(a.name, b.name)
}
Odin implementation
package main
import "core:bufio"
import "core:bytes"
import "core:fmt"
import "core:io"
import "core:os"
import "core:slice"
import "core:sort"
import "core:strings"
// Author pairs an author's name with the number of papers counted for them.
// The field order matters: positional literals rely on it.
Author :: struct {
	count: int,
	name:  string,
}
// main counts papers per author across the CSV files given as program
// arguments and prints one "<count> <name>" line per author to stdout,
// ordered by compare. Each file name is echoed to stderr before it is
// processed; a processing or output error is reported on stderr and ends the
// program with exit status 1.
main :: proc() {
	m := make(map[string]int)
	defer delete(m)
	for a in os.args[1:] {
		fmt.eprintln(a)
		if err := process_file(&m, a); err != nil {
			fmt.eprintln("error in file:", a, err)
			os.exit(1)
		}
	}

	// The map size is the exact number of authors, so the array never regrows.
	authors := make([dynamic]Author, 0, len(m))
	defer delete(authors)
	for k, v in m {
		append(&authors, Author{v, k})
	}
	sort.quick_sort_proc(authors[:], compare)

	// Buffer stdout so the report is not handed to the OS one line at a time.
	b: bufio.Writer
	buf: [4096]u8
	bufio.writer_init_with_buf(&b, os.stream_from_handle(os.stdout), buf[:])
	defer bufio.writer_destroy(&b)
	w := bufio.writer_to_stream(&b)
	for author in authors {
		// flush=false keeps wprintln from flushing the writer after every line.
		fmt.wprintln(w, author.count, author.name, flush=false)
	}
	// Report a failed write instead of silently dropping the flush result.
	if err := bufio.writer_flush(&b); err != .None {
		fmt.eprintln("error writing output:", err)
		os.exit(1)
	}

	// Release everything allocated with the temporary allocator in one go.
	free_all(context.temp_allocator)
}
// Error is returned by process_file: it carries either an os error or a
// bufio scanner error, and the nil of either member is the nil of the union.
Error :: union #shared_nil {
	os.Error,
	bufio.Scanner_Error,
}

// process_file reads the file at filepath line by line and passes every line
// to process_line, which accumulates the counts in m. It returns the error
// from opening the file or, failing that, the error reported by the scanner.
process_file :: proc(m: ^map[string]int, filepath: string) -> Error {
	handle := os.open(filepath) or_return
	defer os.close(handle)

	// One fixed 128 KiB buffer handed to the scanner.
	line_buffer: [128 * 1024]byte
	scanner: bufio.Scanner
	bufio.scanner_init_with_buffer(&scanner, os.stream_from_handle(handle), line_buffer[:])
	defer bufio.scanner_destroy(&scanner)

	for bufio.scanner_scan(&scanner) {
		process_line(m, bufio.scanner_bytes(&scanner))
	}
	return bufio.scanner_error(&scanner)
}
// Field separator as a byte slice, as expected by bytes.split_iterator.
semicolon :: []byte{';'}

// process_line adds one paper to m for every non-empty author name found in
// the sixth ';'-separated field of line. Empty lines, lines starting with '#'
// and lines with fewer than six fields leave m untouched.
process_line :: proc(m: ^map[string]int, line: []u8) {
	if len(line) == 0 || line[0] == '#' {
		return
	}

	AUTHORS_FIELD :: 5 // zero-based index of the author list
	rest := line // mutable copy of the slice header: the iterator advances it
	field := 0
	for token in bytes.split_iterator(&rest, semicolon) {
		if field < AUTHORS_FIELD {
			field += 1
			continue
		}
		names := string(token)
		for name in strings.split_by_byte_iterator(&names, ',') {
			if len(name) == 0 {
				continue
			}
			if count, found := &m[name]; found {
				count^ += 1
			} else {
				// Only a name that becomes a new key is cloned.
				m[strings.clone(name, allocator = context.temp_allocator)] = 1
			}
		}
		return
	}
}
// compare orders authors by paper count, highest first, and breaks ties by
// name using sort.compare_strings.
compare :: proc(a, b: Author) -> int {
	if a.count < b.count {
		return 1
	}
	if a.count > b.count {
		return -1
	}
	return sort.compare_strings(a.name, b.name)
}