Add documentation on tracepoints

Also clean up how-to-build documentation
2025-06-27 05:30:23 +02:00 · 2025-02-14 12:30:40 +01:00 · 2025-02-14 12:30:40 +01:00 · 367e04f164
commit 367e04f164
parent 81da950ebe
4 changed files with 346 additions and 166 deletions
--- a/ctf_to_json.py
+++ b/ctf_to_json.py
@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+# Any copyright is dedicated to the Public Domain.
+# https://creativecommons.org/publicdomain/zero/1.0/
+#
+# Originally written by Andy Wingo <wingo@igalia.com>.
+
+import bt2 # From the babeltrace2 package.
+import sys
+import json
+from enum import Enum
+
+# Usage: ./ctf_to_json.py ~/lttng-traces/name-of-your-trace > foo.json
+#
+# Convert a Common Trace Format (CTF) trace, for example as produced by
+# LTTng, to the JSON-based Trace Event Format (TEF), for example as
+# consumed by `chrome://tracing`, `https://ui.perfetto.dev/`, or
+# `https://profiler.firefox.com`.
+
+# The Trace Event Format is documented here:
+#
+# https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0
+
+# By default, events are emitted as EventPhase.INSTANT.  We also support
+# rewriting the event stream so as to generate EventPhase.BEGIN /
+# EventPhase.END events for specific named events.
+
+# Each key names a synthetic duration span.  Its value is the pair
+# [begin event name, end event name]: the trace event that opens the
+# span and the trace event that closes it.
+synthetic_events = {
+    'gc': [
+        'whippet:mutator_cause_gc',
+        'whippet:restarting_mutators',
+    ],
+    'stop-the-world': [
+        'whippet:requesting_stop',
+        'whippet:mutators_stopped',
+    ],
+    'trace': [
+        'whippet:prepare_gc',
+        'whippet:restarting_mutators',
+    ],
+    'mutator-stopped': [
+        'whippet:mutator_stopping',
+        'whippet:mutator_restarted',
+    ],
+    'trace-roots': [
+        'whippet:trace_roots_begin',
+        'whippet:trace_roots_end',
+    ],
+    'trace-check-termination': [
+        'whippet:trace_check_termination_begin',
+        'whippet:trace_check_termination_end',
+    ],
+    'trace-objects': [
+        'whippet:trace_objects_begin',
+        'whippet:trace_objects_end',
+    ],
+    'trace-worker': [
+        'whippet:trace_worker_begin',
+        'whippet:trace_worker_end',
+    ],
+}
+
+class EventPhase(Enum):
+    """Phase codes of the Trace Event Format.
+
+    Each member's value is the string written to the 'ph' field of an
+    emitted event.
+    """
+    BEGIN = 'B'
+    END = 'E'
+    COMPLETE = 'X'
+    INSTANT = 'i'
+    COUNTER = 'C'
+    NESTABLE_START = 'b'
+    NESTABLE_INSTANT = 'n'
+    NESTABLE_END = 'e'
+    FLOW_START = 's'
+    FLOW_STEP = 't'
+    FLOW_END = 'f'
+    SAMPLE = 'P'
+    OBJECT_CREATED = 'N'
+    OBJECT_SNAPSHOT = 'O'
+    OBJECT_DESTROYED = 'D'
+    METADATA = 'M'
+    # Global and process memory dumps differ only in letter case.  Giving
+    # both the value 'V' would make MEMORY_DUMP_PROCESS an Enum alias of
+    # MEMORY_DUMP_GLOBAL instead of a distinct phase.
+    MEMORY_DUMP_GLOBAL = 'V'
+    MEMORY_DUMP_PROCESS = 'v'
+    MARK = 'R'
+    CLOCK_SYNC = 'c'
+    CONTEXT_BEGIN = '('
+    CONTEXT_END = ')'
+
+# Clock value of the first event seen; later timestamps are relative to it.
+base_time = None
+def event_us(msg):
+    """Return the timestamp of `msg` in microseconds since the first event.
+
+    The message's default clock must be named 'monotonic' and tick at
+    1e9 Hz, so that its snapshot value is a nanosecond count.  The first
+    call records that value in the global `base_time` and returns 0.
+    """
+    global base_time
+    snapshot = msg.default_clock_snapshot
+    clock = snapshot.clock_class
+    assert(clock.name == 'monotonic')
+    assert(clock.frequency == 1e9)
+    ns = snapshot.value
+    if base_time is None:
+        base_time = ns
+    elapsed_ns = ns - base_time
+    return elapsed_ns * 1e-3
+
+def lower(x):
+    """Convert a trace value into plain Python data.
+
+    Strings, ints and floats are returned unchanged.  Dicts and bt2
+    structure fields become dicts whose keys and values are lowered
+    recursively.  bt2 boolean, integer, real and string values/fields
+    become bool, int, float and str; enumeration fields become their
+    repr().  Anything else raises ValueError.
+    """
+    if isinstance(x, (str, int, float)):
+        return x
+    if isinstance(x, dict) or isinstance(x, bt2._StructureFieldConst):
+        lowered = {}
+        for key, value in x.items():
+            plain_key = lower(key)
+            lowered[plain_key] = lower(value)
+        return lowered
+    if isinstance(x, (bt2._BoolValueConst, bt2._BoolFieldConst)):
+        return bool(x)
+    # Enumeration fields are checked before the integer fields below.
+    if isinstance(x, bt2._EnumerationFieldConst):
+        return repr(x)
+    if isinstance(x, (bt2._IntegerValueConst, bt2._IntegerFieldConst)):
+        return int(x)
+    if isinstance(x, (bt2._RealValueConst, bt2._RealFieldConst)):
+        return float(x)
+    if isinstance(x, (bt2._StringValueConst, bt2._StringFieldConst)):
+        return str(x)
+    raise ValueError("Unexpected value from trace", x)
+
+# Specific Whippet events.
+# Index the synthetic span names by the event that opens them and by
+# the event that closes them.  A single event may open or close more
+# than one span, so each index value is a list of span names.
+synthetic_begin = {}
+synthetic_end = {}
+for synthetic, [begin, end] in synthetic_events.items():
+    synthetic_begin.setdefault(begin, []).append(synthetic)
+    synthetic_end.setdefault(end, []).append(synthetic)
+
+def put(str):
+    """Write the string `str` to standard output, adding no newline."""
+    out = sys.stdout
+    out.write(str)
+
+# False until the first event has been printed; afterwards every event
+# is preceded by a separator.
+need_comma = False
+def print_event(ev):
+    """Write `ev` to standard output as one JSON value.
+
+    Every event after the first is preceded by a comma, a newline and
+    four spaces of indentation.
+    """
+    global need_comma
+    if not need_comma:
+        need_comma = True
+    else:
+        sys.stdout.write(',\n    ')
+    # It appears to be faster to make a string, then print the string,
+    # than to call json.dump with a file object.
+    serialized = json.dumps(ev, ensure_ascii=False, check_circular=False)
+    put(serialized)
+
+def emit_event(msg, name, phase):
+    """Print one trace event for `msg`, called `name`, with phase `phase`.
+
+    `phase` is an EventPhase member.  The process and thread IDs are read
+    from the 'vpid' and 'vtid' entries of the event's common context, and
+    the event payload becomes the 'args' of the emitted event.
+    """
+    ph = phase.value
+    ts = event_us(msg)
+    context = msg.event.common_context_field
+    pid = lower(context['vpid'])
+    tid = lower(context['vtid'])
+    args = lower(msg.event.payload_field)
+    print_event({
+        'name': name,
+        'cat': 'whippet',
+        'ph': ph,
+        'ts': ts,
+        'pid': pid,
+        'tid': tid,
+        'args': args,
+    })
+def emit_begin_event(msg, name):
+    """Print an EventPhase.BEGIN event called `name` for `msg`."""
+    emit_event(msg, name, phase=EventPhase.BEGIN)
+def emit_end_event(msg, name):
+    """Print an EventPhase.END event called `name` for `msg`."""
+    emit_event(msg, name, phase=EventPhase.END)
+
+def emit_events(msg):
+    """Print every output event derived from the event message `msg`.
+
+    The event itself is printed as an instant event.  It is followed by
+    a BEGIN event for each synthetic span it opens and then an END event
+    for each synthetic span it closes.
+    """
+    event_name = msg.event.name
+    emit_event(msg, event_name, EventPhase.INSTANT)
+    for span in synthetic_begin.get(event_name, []):
+        emit_begin_event(msg, span)
+    for span in synthetic_end.get(event_name, []):
+        emit_end_event(msg, span)
+
+def ctf_to_json(path):
+    """Write the trace found at `path` to standard output as JSON.
+
+    The output is a single object with a "traceEvents" array and a
+    "displayTimeUnit" of "ns".  Messages that have no `event` attribute
+    are skipped.
+    """
+    # Open the trace before writing anything, so that a failure to open
+    # it leaves standard output untouched.
+    messages = bt2.TraceCollectionMessageIterator(path)
+    put('{\n'
+        '  "traceEvents": [\n    ')
+    for msg in messages:
+        if not hasattr(msg, 'event'):
+            continue
+        emit_events(msg)
+    put('\n'
+        '\n  ],\n'
+        '  "displayTimeUnit": "ns"\n'
+        '}\n')
+
+# Script entry point: exactly one argument, the trace directory, is
+# required; otherwise print a usage message and exit with status 1.
+if len(sys.argv) == 2:
+    ctf_to_json(sys.argv[1])
+else:
+    usage = 'usage: ' + sys.argv[0] + ' ~/lttng-traces/name-of-your-trace\n'
+    sys.stderr.write(usage)
+    sys.exit(1)
--- a/doc/manual.md
+++ b/doc/manual.md
@ -176,13 +176,14 @@ implementations of that API: `semi`, a simple semi-space collector;
 collector; and `mmc`, a mostly-marking collector inspired by Immix.

 The program that embeds Whippet selects the collector implementation at
-build-time.  In the case of the `mmc` collector, the program
-also configures a specific collector mode, again at build-time:
-generational or not, parallel or not, stack-conservative or not, and
-heap-conservative or not.  It may be nice in the future to be able to
-configure these at run-time, but for the time being they are
-compile-time options so that adding new features doesn't change the
-footprint of a more minimal collector.
+build-time.  For `pcc`, the program can also choose whether to be
+generational or not.  For the `mmc` collector, the program configures a
+specific collector mode, again at build-time: generational or not,
+parallel or not, stack-conservative or not, and heap-conservative or
+not.  It may be nice in the future to be able to configure these at
+run-time, but for the time being they are compile-time options so that
+adding new features doesn't change the footprint of a more minimal
+collector.

 Different collectors have different allocation strategies: for example,
 the BDW collector allocates from thread-local freelists, whereas the
@ -199,97 +200,58 @@ compiling user code.

 ### Compiling the collector

-Building the collector is not as easy as it should be.  As an embed-only
-library, we don't get to choose the One True Build System and then just
-build the software in that way; instead Whippet needs to be buildable
-with any build system.  At some point we will have snippets that
-embedders can include in their various build systems, but for now we
-document the low-level structure, so that people can craft the
-appropriate incantations for their program's build system.
+As an embed-only library, Whippet needs to be integrated into the build
+system of its host (embedder).  Currently the only supported build
+system uses GNU make.  We would be happy to add other systems over time.

-Whippet consists of some collector-implementation-agnostic independent
-modules, and then the collector implementation itself.  Though Whippet
-tries to put performance-sensitive interfaces in header files, users
-should also compile with link-time optimization (LTO) to remove any
-overhead imposed by the division of code into separate compilation
-units.
+At a high level, first the embedder chooses a collector and defines how
+to specialize the collector against the embedder.  Whippet's `embed.mk`
+Makefile snippet then defines how to build the set of object files that
+define the collector, and how to specialize the embedder against the
+chosen collector.

-Usually you want to build with maximum optimization and no debugging
-assertions.  Sometimes you want minimal optimization and all assertions.
-Here's what we do, as a `Makefile` snippet:
+As an example, say you have a file `program.c`, and you want to compile
+it against a Whippet checkout in `whippet/`.  Your headers are in
+`include/`, and you have written an implementation of the embedder
+interface in `host-gc.h`.  In that case you would have a Makefile like
+this:

 ```
-DEFAULT_BUILD=opt
-BUILD_CFLAGS_opt=-O2 -g -DNDEBUG
-BUILD_CFLAGS_optdebug=-Og -g -DGC_DEBUG=1
-BUILD_CFLAGS_debug=-O0 -g -DGC_DEBUG=1
-BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD)))
+HOST_DIR:=$(dir $(lastword $(MAKEFILE_LIST)))
+WHIPPET_DIR=$(HOST_DIR)whippet/
+
+all: program
+
+# The collector to choose: e.g. semi, bdw, pcc, generational-pcc, mmc,
+# parallel-mmc, etc.
+GC_COLLECTOR=pcc
+
+include $(WHIPPET_DIR)embed.mk
+
+# Host cflags go here...
+HOST_CFLAGS=
+
+# Whippet's embed.mk uses this variable when it compiles code that
+# should be specialized against the embedder.
+EMBEDDER_TO_GC_CFLAGS=$(HOST_CFLAGS) -include $(HOST_DIR)host-gc.h
+
+program.o: program.c
+	$(GC_COMPILE) $(HOST_CFLAGS) $(GC_TO_EMBEDDER_CFLAGS) -c $<
+program: program.o $(GC_OBJS)
+	$(GC_LINK) $^ $(GC_LIBS)
 ```

-So if you do just plain `make`, it will do an `opt` build.  You can
-specify the build mode by setting `BUILD` on the command line, as in
-`make BUILD=debug`.
+The optimization settings passed to the C compiler are taken from
+`GC_BUILD_CFLAGS`.  Embedders can override this variable directly, or
+via the shorthand `GC_BUILD` variable.  A `GC_BUILD` of `opt` indicates
+maximum optimization and no debugging assertions; `optdebug` adds
+debugging assertions; and `debug` removes optimizations.

-Then for the actual compilation flags, we do:
-
-```
-CC=gcc
-CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS)
-INCLUDES=-I.
-LDFLAGS=-lpthread -flto
-COMPILE=$(CC) $(CFLAGS) $(INCLUDES)
-```
-
-The actual include directory (the dot in `-I.`) should be adjusted as
-appropriate.
-
-#### Collector-implementation-agnostic independent modules
-
-There are currently four generic modules that don't depend on the choice
-of collector.  The first is `gc-stack.o`, which has supporting code to
-associate mutators (threads) with slices of the native stack, in order
-to support conservative root-finding.
-
-```
-$(COMPILE) -o gc-stack.o -c gc-stack.c
-```
-
-The next is a generic options interface, to allow the user to
-parameterize the collector at run-time, for example to implement a
-specific heap sizing strategy.
-
-```
-$(COMPILE) -o gc-options.o -c gc-options.c
-```
-
-Next, where Whippet needs to get data from the operating system, for
-example the number of processors available, it does so behind an
-abstract interface that is selected at compile-time.  The only
-implementation currently is for GNU/Linux, but it's a pretty thin layer,
-so adding more systems should not be difficult.
-
-```
-PLATFORM=gnu-linux
-$(COMPILE) -o gc-platform.o -c gc-platform-$(PLATFORM).c
-```
-
-Finally, something a little more complicated: ephemerons.  Ephemerons
-are objects that make a weak association between a key and a value.  As
-first-class objects, they need to be classifiable by the user system,
-and notably via the `gc_trace_object` procedure, and therefore need to
-have a header whose shape is understandable by the embedding program.
-We do this by including the `gc-embedder-api.h` implementation, via
-`-include`, in this case providing `foo-embedder.h`:
-
-```
-$(COMPILE) -include foo-embedder.h -o gc-ephemeron.o -c gc-ephemeron.c
-```
-
-As for ephemerons, finalizers also have their own compilation unit.
-
-```
-$(COMPILE) -include foo-embedder.h -o gc-finalizer.o -c gc-finalizer.c
-```
+Though Whippet tries to put performance-sensitive interfaces in header
+files, users should also compile with link-time optimization (LTO) to
+remove any overhead imposed by the division of code into separate
+compilation units.  `embed.mk` includes the necessary LTO flags in
+`GC_CFLAGS` and `GC_LDFLAGS`.

 #### Compile-time options

@ -316,82 +278,14 @@ Some collectors require specific compile-time options.  For example, the
 semi-space collector has to be able to move all objects; this is not
 compatible with conservative roots or heap edges.

-#### Building `semi`
+#### Tracing support

-Finally, let's build a collector.  The simplest collector is the
-semi-space collector.  The entirety of the implementation can be had by
-compiling `semi.c`, providing the program's embedder API implementation
-via `-include`:
-
-```
-$(COMPILE) -DGC_PRECISE_ROOTS=1 -include foo-embedder.h -o gc.o -c semi.c
-```
-
-#### Building `bdw`
-
-The next simplest collector uses
-[BDW-GC](https://github.com/ivmai/bdwgc).  This collector must scan the
-roots and heap conservatively.  The collector is parallel if BDW-GC
-itself was compiled with parallelism enabled.
-
-```
-$(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 \
-  `pkg-config --cflags bdw-gc` \
-  -include foo-embedder.h -o gc.o -c bdw.c
-```
-
-#### Building `pcc`
-
-The parallel copying collector is like `semi` but better in every way:
-it supports multiple mutator threads, and evacuates in parallel if
-multiple threads are available.
-
-```
-$(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 \
-  -include foo-embedder.h -o gc.o -c pcc.c
-```
-
-You can also build `pcc` in a generational configuration by passing
-`-DGC_GENERATIONAL=1`.  The nursery is 2 MB per active mutator, capped
-to the number of processors, so if the last cycle had a maximum of 4
-mutator threads active at the same time and your machine has 24 cores,
-your nursery would be 8 MB.
-
-#### Building `mmc`
-
-Finally, there is the mostly-marking collector.  It can collect roots
-precisely or conservatively, trace precisely or conservatively, be
-parallel or not, and be generational or not.
-
-```
-$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \
-  -include foo-embedder.h -o gc.o -c mvv.c
-```
-
-### Compiling your program
-
-Any compilation unit that uses the GC API should have the same set of
-compile-time options defined as when compiling the collector.
-Additionally those compilation units should include the "attributes"
-header for the collector in question, namely `semi-attrs.h`,
-`bdw-attrs.h`, `pcc-attrs.h`, or `mmc-attrs.h`.  For example, for
-parallel generational mmc, you might have:
-
-```
-$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \
-  -include mmc-attrs.h -o my-program.o -c my-program.c
-```
-
-### Linking the collector into your program
-
-Finally to link, pass all objects to the linker.  You will want to
-ensure that the linker enables `-flto`, for link-time optimization.  We
-do it like this:
-
-```
-$(CC) $(LDFLAGS) -o my-program \
-  my-program.o gc-stack.o gc-platform.o gc-options.o gc-ephemeron.o
-```
+Whippet includes support for low-overhead run-time tracing via
+[LTTng](https://lttng.org/).  If the support library `lttng-ust` is
+present when Whippet is compiled (as checked via `pkg-config`),
+tracepoint support will be present.  See
+[tracepoints.md](./tracepoints.md) for more information on how to get
+performance traces out of Whippet.

 ## Using the collector

--- a/doc/perfetto-minor-gc.png
+++ b/doc/perfetto-minor-gc.png
--- a/doc/tracepoints.md
+++ b/doc/tracepoints.md
@ -0,0 +1,126 @@
+# Whippet performance tracing
+
+Whippet includes support for run-time tracing via
+[LTTng](https://LTTng.org) user-space tracepoints.  This allows you to
+get a detailed look at how Whippet is performing on your system.
+Tracing support is currently limited to Linux systems.
+
+## Getting started
+
+First, you need to build Whippet with LTTng support.  Usually this is as
+easy as building it in an environment where the `lttng-ust` library is
+present, as determined by `pkg-config --libs lttng-ust`.  You can tell
+whether your Whippet has tracing support by checking whether the
+resulting binaries are dynamically linked to `liblttng-ust`.
+
+If we take as an example the `mt-gcbench` test in the Whippet source
+tree, we would have:
+
+```
+$ ldd bin/mt-gcbench.pcc | grep lttng
+...
+liblttng-ust.so.1 => ...
+...
+```
+
+### Capturing traces
+
+Actually capturing traces is a little annoying; it's not as easy as
+`perf record`.  The [LTTng
+documentation](https://lttng.org/docs/v2.13/#doc-controlling-tracing) is
+quite thorough, but here is a summary.
+
+First, create your tracing session:
+
+```
+$ lttng create
+Session auto-20250214-091153 created.
+Traces will be output to $HOME/lttng-traces/auto-20250214-091153
+```
+
+You run all these commands as your own user; they don't require root
+permissions or system-wide modifications, as all of the Whippet
+tracepoints are user-space tracepoints (UST).
+
+Just having an LTTng session created won't do anything though; you need
+to configure the session.  Monotonic nanosecond-resolution timestamps
+are already implicitly part of each event.  We also want to have process
+and thread IDs for all events:
+
+```
+$ lttng add-context --userspace --type=vpid --type=vtid
+ust context vpid added to all channels
+ust context vtid added to all channels
+```
+
+Now enable Whippet events:
+
+```
+$ lttng enable-event --userspace 'whippet:*'
+ust event whippet:* created in channel channel0
+```
+
+And now, start recording:
+
+```
+$ lttng start
+Tracing started for session auto-20250214-091153
+```
+
+With this, traces will be captured for our program of interest:
+
+```
+$ bin/mt-gcbench.pcc 2.5 8
+...
+```
+
+Now stop the trace:
+
+```
+$ lttng stop
+Waiting for data availability
+Tracing stopped for session auto-20250214-091153
+```
+
+Whew.  If we did it right, our data is now in
+`$HOME/lttng-traces/auto-20250214-091153`.
+
+### Visualizing traces
+
+LTTng produces traces in the [Common Trace Format
+(CTF)](https://diamon.org/ctf/).  My favorite trace viewing tool is the
+family of web-based trace viewers derived from `chrome://tracing`.  The
+best of these appear to be [the Firefox
+profiler](https://profiler.firefox.com) and
+[Perfetto](https://ui.perfetto.dev).  Unfortunately neither of these can
+work with CTF directly, so we instead need to run a trace converter.
+
+Oddly, there is no trace converter that can read CTF and write something
+that Perfetto (e.g.) can read.  However there is a JSON-based tracing
+format that Perfetto can read, and [Python bindings for Babeltrace, a
+library that works with CTF](https://babeltrace.org/), so that's what we
+do:
+
+```
+$ python3 ctf_to_json.py ~/lttng-traces/auto-20250214-091153 > trace.json
+```
+
+While the Firefox Profiler can load this file, it works better in Perfetto,
+as the Whippet events are visually rendered on their respective threads.
+
+![Screenshot of part of Perfetto UI showing a minor GC](./perfetto-minor-gc.png)
+
+### Expanding the set of events
+
+As of February 2025,
+the current set of tracepoints includes the [heap
+events](https://github.com/wingo/whippet/blob/main/doc/manual.md#statistics)
+and some detailed internals of the parallel tracer.  We expect this set
+of tracepoints to expand over time.
+
+### Overhead of tracepoints
+
+When tracepoints are compiled in but no events are enabled, tracepoints
+appear to have no impact on run-time.  When event collection is on, for
+x86-64 hardware, [emitting a tracepoint event takes about
+100ns](https://discuss.systems/@DesnoyersMa/113986344940256872).