1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-27 05:30:23 +02:00

Add documentation on tracepoints

Also clean up how-to-build documentation
This commit is contained in:
Andy Wingo 2025-02-14 12:30:40 +01:00
parent 81da950ebe
commit 367e04f164
4 changed files with 346 additions and 166 deletions

160
ctf_to_json.py Executable file
View file

@ -0,0 +1,160 @@
#!/usr/bin/env python3
# Any copyright is dedicated to the Public Domain.
# https://creativecommons.org/publicdomain/zero/1.0/
#
# Originally written by Andy Wingo <wingo@igalia.com>.
import bt2 # From the babeltrace2 package.
import sys
import json
from enum import Enum
# Usage: ./ctf_to_json.py ~/lttng-traces/name-of-your-trace > foo.json
#
# Convert a Common Trace Format (CTF) trace, for example as produced by
# LTTng, to the JSON-based Trace Event Format (TEF), for example as
# consumed by `chrome://tracing`, `https://ui.perfetto.dev/`, or
# `https://profiler.firefox.com`.
# The Trace Event Format is documented here:
#
# https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0
# By default, events are emitted as EventPhase.INSTANT. We also support
# rewriting the event stream so as to generate EventPhase.BEGIN /
# EventPhase.END events for specific named events.
# Maps the name of each synthetic duration span to the
# [begin-tracepoint, end-tracepoint] pair that delimits it.  The rows
# below are (span name, begin event, end event).
synthetic_events = {
    span: [begin_event, end_event]
    for span, begin_event, end_event in (
        ('gc',
         'whippet:mutator_cause_gc',
         'whippet:restarting_mutators'),
        ('stop-the-world',
         'whippet:requesting_stop',
         'whippet:mutators_stopped'),
        ('trace',
         'whippet:prepare_gc',
         'whippet:restarting_mutators'),
        ('mutator-stopped',
         'whippet:mutator_stopping',
         'whippet:mutator_restarted'),
        ('trace-roots',
         'whippet:trace_roots_begin',
         'whippet:trace_roots_end'),
        ('trace-check-termination',
         'whippet:trace_check_termination_begin',
         'whippet:trace_check_termination_end'),
        ('trace-objects',
         'whippet:trace_objects_begin',
         'whippet:trace_objects_end'),
        ('trace-worker',
         'whippet:trace_worker_begin',
         'whippet:trace_worker_end'),
    )
}
class EventPhase(Enum):
    """Phase codes for the 'ph' field of a Trace Event Format event.

    Each member's value is the one-character string written to the
    JSON output.  Values must be distinct: Enum silently turns a
    repeated value into an alias of the first member that has it.
    """
    # Duration events.
    BEGIN = 'B'
    END = 'E'
    COMPLETE = 'X'
    INSTANT = 'i'
    COUNTER = 'C'
    # Async (nestable) events.
    NESTABLE_START = 'b'
    NESTABLE_INSTANT = 'n'
    NESTABLE_END = 'e'
    # Flow events.
    FLOW_START = 's'
    FLOW_STEP = 't'
    FLOW_END = 'f'
    SAMPLE = 'P'
    # Object lifetime events.
    OBJECT_CREATED = 'N'
    OBJECT_SNAPSHOT = 'O'
    OBJECT_DESTROYED = 'D'
    METADATA = 'M'
    # Memory dumps: upper-case 'V' is global, lower-case 'v' is
    # per-process.
    MEMORY_DUMP_GLOBAL = 'V'
    MEMORY_DUMP_PROCESS = 'v'
    MARK = 'R'
    CLOCK_SYNC = 'c'
    CONTEXT_BEGIN = '('
    CONTEXT_END = ')'
# Clock value of the first event passed to event_us(); None until then.
# All returned timestamps are offsets from this value.
base_time = None


def event_us(msg):
    """Return the timestamp of `msg` in microseconds since the first event.

    The first message seen fixes the time origin, so it yields 0.0.
    The message's default clock must be named 'monotonic' and have a
    frequency of 1e9 (so its value is in nanoseconds); otherwise an
    AssertionError is raised.
    """
    global base_time
    snapshot = msg.default_clock_snapshot
    clock_class = snapshot.clock_class
    assert clock_class.name == 'monotonic'
    assert clock_class.frequency == 1e9
    now_ns = snapshot.value
    if base_time is None:
        base_time = now_ns
    # Nanoseconds to microseconds.
    return (now_ns - base_time) * 1e-3
def lower(x):
    """Convert a trace value into plain data that `json` can serialize.

    Plain str/int/float values are returned unchanged.  Dicts and bt2
    structure fields become dicts with both keys and values lowered
    recursively.  bt2 boolean, integer, real and string values/fields
    become bool, int, float and str respectively; bt2 enumeration
    fields become their repr().

    Raises ValueError for any other kind of value.
    """
    if isinstance(x, (str, int, float)):
        return x
    # Test `dict` on its own first so that plain-Python input never
    # needs to look anything up in bt2.
    if isinstance(x, dict) or isinstance(x, bt2._StructureFieldConst):
        return {lower(key): lower(value) for key, value in x.items()}
    if isinstance(x, (bt2._BoolValueConst, bt2._BoolFieldConst)):
        return bool(x)
    # NOTE(review): the enumeration test is deliberately kept ahead of
    # the integer test, as in the original ordering -- presumably an
    # enumeration field would otherwise match as an integer; confirm
    # against the bt2 class hierarchy before reordering.
    if isinstance(x, bt2._EnumerationFieldConst):
        return repr(x)
    if isinstance(x, (bt2._IntegerValueConst, bt2._IntegerFieldConst)):
        return int(x)
    if isinstance(x, (bt2._RealValueConst, bt2._RealFieldConst)):
        return float(x)
    if isinstance(x, (bt2._StringValueConst, bt2._StringFieldConst)):
        return str(x)
    raise ValueError("Unexpected value from trace", x)
# Specific Whippet events.
# Reverse indexes over synthetic_events: tracepoint name -> list of the
# synthetic spans that the tracepoint opens (synthetic_begin) or closes
# (synthetic_end).  A single tracepoint may open or close several spans.
synthetic_begin = {}
synthetic_end = {}
for synthetic, (begin, end) in synthetic_events.items():
    synthetic_begin.setdefault(begin, []).append(synthetic)
    synthetic_end.setdefault(end, []).append(synthetic)
def put(str):
    """Write `str` to standard output as-is; no newline is appended."""
    out = sys.stdout
    out.write(str)
# False until the first event has been written; after that, every
# event is preceded by a separating comma.
need_comma = False


def print_event(ev):
    """Write `ev` to standard output as one JSON value.

    Every event except the first is preceded by a comma and a newline,
    so that the sequence of calls produces the elements of a JSON
    array.
    """
    global need_comma
    if not need_comma:
        need_comma = True
    else:
        sys.stdout.write(',\n ')
    # It appears to be faster to build the string and then print it
    # than to have json.dump write to the file object directly.
    serialized = json.dumps(ev, ensure_ascii=False, check_circular=False)
    put(serialized)
def emit_event(msg, name, phase):
    """Print one Trace Event Format record for the event message `msg`.

    `name` is the event name to report and `phase` is the EventPhase
    whose value becomes the 'ph' field.  The process and thread ids are
    read from the 'vpid' and 'vtid' entries of the event's common
    context, and the event payload becomes 'args'.
    """
    event = msg.event
    context = event.common_context_field
    print_event({
        'name': name,
        'cat': 'whippet',
        'ph': phase.value,
        'ts': event_us(msg),
        'pid': lower(context['vpid']),
        'tid': lower(context['vtid']),
        'args': lower(event.payload_field),
    })
def emit_begin_event(msg, name):
    """Print a BEGIN-phase record called `name`, timestamped from `msg`."""
    emit_event(msg, name, phase=EventPhase.BEGIN)
def emit_end_event(msg, name):
    """Print an END-phase record called `name`, timestamped from `msg`."""
    emit_event(msg, name, phase=EventPhase.END)
def emit_events(msg):
    """Print every record derived from the event message `msg`.

    The event itself is always printed as an INSTANT record.  Then a
    BEGIN record is printed for each synthetic span this event opens,
    followed by an END record for each synthetic span it closes.
    """
    event_name = msg.event.name
    emit_event(msg, event_name, EventPhase.INSTANT)
    for opened in synthetic_begin.get(event_name, []):
        emit_begin_event(msg, opened)
    for closed in synthetic_end.get(event_name, []):
        emit_end_event(msg, closed)
def ctf_to_json(path):
    """Convert the trace at `path` to Trace Event Format JSON on stdout.

    The output is a single JSON object with a "traceEvents" array and a
    "displayTimeUnit" of "ns".  Messages without an `event` attribute
    are skipped.
    """
    # Open the trace before writing anything, so that a failure to open
    # it does not leave a partial JSON document on stdout.
    messages = bt2.TraceCollectionMessageIterator(path)
    put('{\n')
    put(' "traceEvents": [\n ')
    for msg in messages:
        if not hasattr(msg, 'event'):
            continue
        emit_events(msg)
    put('\n')
    put('\n ],\n')
    put(' "displayTimeUnit": "ns"\n')
    put('}\n')
# Script entry: exactly one argument, the trace directory, is expected.
# Anything else prints a usage message to stderr and exits with status 1.
if len(sys.argv) == 2:
    ctf_to_json(sys.argv[1])
else:
    sys.stderr.write(
        'usage: ' + sys.argv[0] + ' ~/lttng-traces/name-of-your-trace\n')
    sys.exit(1)

View file

@ -176,13 +176,14 @@ implementations of that API: `semi`, a simple semi-space collector;
collector; and `mmc`, a mostly-marking collector inspired by Immix.
The program that embeds Whippet selects the collector implementation at
build-time. In the case of the `mmc` collector, the program
also configures a specific collector mode, again at build-time:
generational or not, parallel or not, stack-conservative or not, and
heap-conservative or not. It may be nice in the future to be able to
configure these at run-time, but for the time being they are
compile-time options so that adding new features doesn't change the
footprint of a more minimal collector.
build-time. For `pcc`, the program can also choose whether to be
generational or not. For the `mmc` collector, the program configures a
specific collector mode, again at build-time: generational or not,
parallel or not, stack-conservative or not, and heap-conservative or
not. It may be nice in the future to be able to configure these at
run-time, but for the time being they are compile-time options so that
adding new features doesn't change the footprint of a more minimal
collector.
Different collectors have different allocation strategies: for example,
the BDW collector allocates from thread-local freelists, whereas the
@ -199,97 +200,58 @@ compiling user code.
### Compiling the collector
Building the collector is not as easy as it should be. As an embed-only
library, we don't get to choose the One True Build System and then just
build the software in that way; instead Whippet needs to be buildable
with any build system. At some point we will have snippets that
embedders can include in their various build systems, but for now we
document the low-level structure, so that people can craft the
appropriate incantations for their program's build system.
As an embed-only library, Whippet needs to be integrated into the build
system of its host (embedder). Currently the only supported build
system uses GNU make. We would be happy to add other systems over time.
Whippet consists of some collector-implementation-agnostic independent
modules, and then the collector implementation itself. Though Whippet
tries to put performance-sensitive interfaces in header files, users
should also compile with link-time optimization (LTO) to remove any
overhead imposed by the division of code into separate compilation
units.
At a high level, first the embedder chooses a collector and defines how
to specialize the collector against the embedder. Whippet's `embed.mk`
Makefile snippet then defines how to build the set of object files that
define the collector, and how to specialize the embedder against the
chosen collector.
Usually you want to build with maximum optimization and no debugging
assertions. Sometimes you want minimal optimization and all assertions.
Here's what we do, as a `Makefile` snippet:
As an example, say you have a file `program.c`, and you want to compile
it against a Whippet checkout in `whippet/`. Your headers are in
`include/`, and you have written an implementation of the embedder
interface in `host-gc.h`. In that case you would have a Makefile like
this:
```
DEFAULT_BUILD=opt
BUILD_CFLAGS_opt=-O2 -g -DNDEBUG
BUILD_CFLAGS_optdebug=-Og -g -DGC_DEBUG=1
BUILD_CFLAGS_debug=-O0 -g -DGC_DEBUG=1
BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD)))
HOST_DIR:=$(dir $(lastword $(MAKEFILE_LIST)))
WHIPPET_DIR=$(HOST_DIR)whippet/
all: out
# The collector to choose: e.g. semi, bdw, pcc, generational-pcc, mmc,
# parallel-mmc, etc.
GC_COLLECTOR=pcc
include $(WHIPPET_DIR)embed.mk
# Host cflags go here...
HOST_CFLAGS=
# Whippet's embed.mk uses this variable when it compiles code that
# should be specialized against the embedder.
EMBEDDER_TO_GC_CFLAGS=$(HOST_CFLAGS) -include $(HOST_DIR)host-gc.h
program.o: program.c
$(GC_COMPILE) $(HOST_CFLAGS) $(GC_TO_EMBEDDER_CFLAGS) -c $<
program: program.o $(GC_OBJS)
$(GC_LINK) $^ $(GC_LIBS)
```
So if you do just plain `make`, it will do an `opt` build. You can
specify the build mode by setting `BUILD` on the command line, as in
`make BUILD=debug`.
The optimization settings passed to the C compiler are taken from
`GC_BUILD_CFLAGS`. Embedders can override this variable directly, or
via the shorthand `GC_BUILD` variable. A `GC_BUILD` of `opt` indicates
maximum optimization and no debugging assertions; `optdebug` adds
debugging assertions; and `debug` removes optimizations.
Then for the actual compilation flags, we do:
```
CC=gcc
CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS)
INCLUDES=-I.
LDFLAGS=-lpthread -flto
COMPILE=$(CC) $(CFLAGS) $(INCLUDES)
```
The actual include directory (the dot in `-I.`) should be adjusted as
appropriate.
#### Collector-implementation-agnostic independent modules
There are currently four generic modules that don't depend on the choice
of collector. The first is `gc-stack.o`, which has supporting code to
associate mutators (threads) with slices of the native stack, in order
to support conservative root-finding.
```
$(COMPILE) -o gc-stack.o -c gc-stack.c
```
The next is a generic options interface, to allow the user to
parameterize the collector at run-time, for example to implement a
specific heap sizing strategy.
```
$(COMPILE) -o gc-options.o -c gc-options.c
```
Next, where Whippet needs to get data from the operating system, for
example the number of processors available, it does so behind an
abstract interface that is selected at compile-time. The only
implementation currently is for GNU/Linux, but it's a pretty thin layer,
so adding more systems should not be difficult.
```
PLATFORM=gnu-linux
$(COMPILE) -o gc-platform.o -c gc-platform-$(PLATFORM).c
```
Finally, something a little more complicated: ephemerons. Ephemerons
are objects that make a weak association between a key and a value. As
first-class objects, they need to be classifiable by the user system,
and notably via the `gc_trace_object` procedure, and therefore need to
have a header whose shape is understandable by the embedding program.
We do this by including the `gc-embedder-api.h` implementation, via
`-include`, in this case providing `foo-embedder.h`:
```
$(COMPILE) -include foo-embedder.h -o gc-ephemeron.o -c gc-ephemeron.c
```
As for ephemerons, finalizers also have their own compilation unit.
```
$(COMPILE) -include foo-embedder.h -o gc-finalizer.o -c gc-finalizer.c
```
Though Whippet tries to put performance-sensitive interfaces in header
files, users should also compile with link-time optimization (LTO) to
remove any overhead imposed by the division of code into separate
compilation units. `embed.mk` includes the necessary LTO flags in
`GC_CFLAGS` and `GC_LDFLAGS`.
#### Compile-time options
@ -316,82 +278,14 @@ Some collectors require specific compile-time options. For example, the
semi-space collector has to be able to move all objects; this is not
compatible with conservative roots or heap edges.
#### Building `semi`
#### Tracing support
Finally, let's build a collector. The simplest collector is the
semi-space collector. The entirety of the implementation can be had by
compiling `semi.c`, providing the program's embedder API implementation
via `-include`:
```
$(COMPILE) -DGC_PRECISE_ROOTS=1 -include foo-embedder.h -o gc.o -c semi.c
```
#### Building `bdw`
The next simplest collector uses
[BDW-GC](https://github.com/ivmai/bdwgc). This collector must scan the
roots and heap conservatively. The collector is parallel if BDW-GC
itself was compiled with parallelism enabled.
```
$(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 \
`pkg-config --cflags bdw-gc` \
-include foo-embedder.h -o gc.o -c bdw.c
```
#### Building `pcc`
The parallel copying collector is like `semi` but better in every way:
it supports multiple mutator threads, and evacuates in parallel if
multiple threads are available.
```
$(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 \
-include foo-embedder.h -o gc.o -c pcc.c
```
You can also build `pcc` in a generational configuration by passing
`-DGC_GENERATIONAL=1`. The nursery is 2 MB per active mutator, capped
to the number of processors, so if the last cycle had a maximum of 4
mutator threads active at the same time and your machine has 24 cores,
your nursery would be 8 MB.
#### Building `mmc`
Finally, there is the mostly-marking collector. It can collect roots
precisely or conservatively, trace precisely or conservatively, be
parallel or not, and be generational or not.
```
$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \
-include foo-embedder.h -o gc.o -c mvv.c
```
### Compiling your program
Any compilation unit that uses the GC API should have the same set of
compile-time options defined as when compiling the collector.
Additionally those compilation units should include the "attributes"
header for the collector in question, namely `semi-attrs.h`,
`bdw-attrs.h`, `pcc-attrs.h`, or `mmc-attrs.h`. For example, for
parallel generational mmc, you might have:
```
$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \
-include mmc-attrs.h -o my-program.o -c my-program.c
```
### Linking the collector into your program
Finally to link, pass all objects to the linker. You will want to
ensure that the linker enables `-flto`, for link-time optimization. We
do it like this:
```
$(CC) $(LDFLAGS) -o my-program \
my-program.o gc-stack.o gc-platform.o gc-options.o gc-ephemeron.o
```
Whippet includes support for low-overhead run-time tracing via
[LTTng](https://lttng.org/). If the support library `lttng-ust` is
present when Whippet is compiled (as checked via `pkg-config`),
tracepoint support will be present. See
[tracepoints.md](./tracepoints.md) for more information on how to get
performance traces out of Whippet.
## Using the collector

BIN
doc/perfetto-minor-gc.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 169 KiB

126
doc/tracepoints.md Normal file
View file

@ -0,0 +1,126 @@
# Whippet performance tracing
Whippet includes support for run-time tracing via
[LTTng](https://LTTng.org) user-space tracepoints. This allows you to
get a detailed look at how Whippet is performing on your system.
Tracing support is currently limited to Linux systems.
## Getting started
First, you need to build Whippet with LTTng support. Usually this is as
easy as building it in an environment where the `lttng-ust` library is
present, as determined by `pkg-config --libs lttng-ust`. You can tell
whether your Whippet build has tracing support by checking whether the
resulting binaries are dynamically linked to `liblttng-ust`.
If we take as an example the `mt-gcbench` test in the Whippet source
tree, we would have:
```
$ ldd bin/mt-gcbench.pcc | grep lttng
...
liblttng-ust.so.1 => ...
...
```
### Capturing traces
Actually capturing traces is a little annoying; it's not as easy as
`perf record`. The [LTTng
documentation](https://lttng.org/docs/v2.13/#doc-controlling-tracing) is
quite thorough, but here is a summary.
First, create your tracing session:
```
$ lttng create
Session auto-20250214-091153 created.
Traces will be output to $HOME/lttng-traces/auto-20250214-091153
```
You run all these commands as your own user; they don't require root
permissions or system-wide modifications, as all of the Whippet
tracepoints are user-space tracepoints (UST).
Just having an LTTng session created won't do anything though; you need
to configure the session. Monotonic nanosecond-resolution timestamps
are already implicitly part of each event. We also want to have process
and thread IDs for all events:
```
$ lttng add-context --userspace --type=vpid --type=vtid
ust context vpid added to all channels
ust context vtid added to all channels
```
Now enable Whippet events:
```
$ lttng enable-event --userspace 'whippet:*'
ust event whippet:* created in channel channel0
```
And now, start recording:
```
$ lttng start
Tracing started for session auto-20250214-091153
```
With this, traces will be captured for our program of interest:
```
$ bin/mt-gcbench.pcc 2.5 8
...
```
Now stop the trace:
```
$ lttng stop
Waiting for data availability
Tracing stopped for session auto-20250214-091153
```
Whew. If we did it right, our data is now in
`$HOME/lttng-traces/auto-20250214-091153`.
### Visualizing traces
LTTng produces traces in the [Common Trace Format
(CTF)](https://diamon.org/ctf/). My favorite trace viewing tool is the
family of web-based trace viewers derived from `chrome://tracing`. The
best of these appear to be [the Firefox
profiler](https://profiler.firefox.com) and
[Perfetto](https://ui.perfetto.dev). Unfortunately neither of these can
work with CTF directly, so we instead need to run a trace converter.
Oddly, there is no off-the-shelf trace converter that can read CTF and
write something that Perfetto (for example) can read. However, there is
a JSON-based tracing format that Perfetto can read, and there are
[Python bindings for Babeltrace, a library that works with
CTF](https://babeltrace.org/), so Whippet includes a small script,
`ctf_to_json.py`, that uses those bindings to do the conversion:
```
$ python3 ctf_to_json.py ~/lttng-traces/auto-20250214-091153 > trace.json
```
While Firefox Profiler can load this file, it works better on Perfetto,
as the Whippet events are visually rendered on their respective threads.
![Screenshot of part of Perfetto UI showing a minor GC](./perfetto-minor-gc.png)
### Expanding the set of events
As of February 2025,
the current set of tracepoints includes the [heap
events](https://github.com/wingo/whippet/blob/main/doc/manual.md#statistics)
and some detailed internals of the parallel tracer. We expect this set
of tracepoints to expand over time.
### Overhead of tracepoints
When tracepoints are compiled in but no events are enabled, tracepoints
appear to have no impact on run-time. When event collection is on, for
x86-64 hardware, [emitting a tracepoint event takes about
100ns](https://discuss.systems/@DesnoyersMa/113986344940256872).