Skip to content

Commit

Permalink
Improve __crystal_once performance
Browse files Browse the repository at this point in the history
Based on the PR by @BlobCodes:
crystal-lang#15216

The performance improvement is the usage of a i8 instead of an i1
boolean to have 3 states instead of 2, which permits to quickly detect
recursive calls without an array + inline tricks to optimize the fast
and slow paths.

Unlike the PR:

1. Removes the need for a state maintained by the compiler. This keeps
   the ability for an older compiler to compile a new release of the
   compiler (or use a newer stdlib) but breaks the ability for a new
   compiler to compile an older release (or use an older stdlib)!
2. Doesn't use atomics: we still use a mutex that already guarantees the
   acquire/release memory ordering semantics, and __crystal_once_init is
   only ever called once in the main thread before any other thread can
   be started.
  • Loading branch information
ysbaddaden committed Jan 6, 2025
1 parent 11cf206 commit cf49927
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 70 deletions.
6 changes: 3 additions & 3 deletions src/compiler/crystal/codegen/class_var.cr
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ class Crystal::CodeGenVisitor
initialized_flag_name = class_var_global_initialized_name(class_var)
initialized_flag = @main_mod.globals[initialized_flag_name]?
unless initialized_flag
initialized_flag = @main_mod.globals.add(@main_llvm_context.int1, initialized_flag_name)
initialized_flag.initializer = @main_llvm_context.int1.const_int(0)
initialized_flag = @main_mod.globals.add(@main_llvm_context.int8, initialized_flag_name)
initialized_flag.initializer = @main_llvm_context.int8.const_int(0)
initialized_flag.linkage = LLVM::Linkage::Internal if @single_module
initialized_flag.thread_local = true if class_var.thread_local?
end
Expand Down Expand Up @@ -61,7 +61,7 @@ class Crystal::CodeGenVisitor
initialized_flag_name = class_var_global_initialized_name(class_var)
initialized_flag = @llvm_mod.globals[initialized_flag_name]?
unless initialized_flag
initialized_flag = @llvm_mod.globals.add(llvm_context.int1, initialized_flag_name)
initialized_flag = @llvm_mod.globals.add(llvm_context.int8, initialized_flag_name)
initialized_flag.thread_local = true if class_var.thread_local?
end
end
Expand Down
4 changes: 2 additions & 2 deletions src/compiler/crystal/codegen/const.cr
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ class Crystal::CodeGenVisitor
initialized_flag_name = const.initialized_llvm_name
initialized_flag = @main_mod.globals[initialized_flag_name]?
unless initialized_flag
initialized_flag = @main_mod.globals.add(@main_llvm_context.int1, initialized_flag_name)
initialized_flag.initializer = @main_llvm_context.int1.const_int(0)
initialized_flag = @main_mod.globals.add(@main_llvm_context.int8, initialized_flag_name)
initialized_flag.initializer = @main_llvm_context.int8.const_int(0)
initialized_flag.linkage = LLVM::Linkage::Internal if @single_module
end
initialized_flag
Expand Down
26 changes: 3 additions & 23 deletions src/compiler/crystal/codegen/once.cr
Original file line number Diff line number Diff line change
@@ -1,37 +1,17 @@
require "./codegen"

class Crystal::CodeGenVisitor
ONCE_STATE = "~ONCE_STATE"

def once_init
if once_init_fun = typed_fun?(@main_mod, ONCE_INIT)
once_init_fun = check_main_fun ONCE_INIT, once_init_fun

once_state_global = @main_mod.globals.add(once_init_fun.type.return_type, ONCE_STATE)
once_state_global.linkage = LLVM::Linkage::Internal if @single_module
once_state_global.initializer = once_init_fun.type.return_type.null

state = call once_init_fun
store state, once_state_global
call once_init_fun
end
end

def run_once(flag, func : LLVMTypedFunction)
once_fun = main_fun(ONCE)
once_init_fun = main_fun(ONCE_INIT)

# both of these should be Void*
once_state_type = once_init_fun.type.return_type
once_initializer_type = once_fun.func.params.last.type

once_state_global = @llvm_mod.globals[ONCE_STATE]? || begin
global = @llvm_mod.globals.add(once_state_type, ONCE_STATE)
global.linkage = LLVM::Linkage::External
global
end

state = load(once_state_type, once_state_global)
once_initializer_type = once_fun.func.params.last.type # must be Void*
initializer = pointer_cast(func.func.to_value, once_initializer_type)
call once_fun, [state, flag, initializer]
call once_fun, [flag, initializer]
end
end
155 changes: 113 additions & 42 deletions src/crystal/once.cr
Original file line number Diff line number Diff line change
@@ -1,54 +1,125 @@
# This file defines the functions `__crystal_once_init` and `__crystal_once` expected
# by the compiler. `__crystal_once` is called each time a constant or class variable
# has to be initialized and is its responsibility to verify the initializer is executed
# only once. `__crystal_once_init` is executed only once at the beginning of the program
# and the result is passed on each call to `__crystal_once`.

# This implementation uses an array to store the initialization flag pointers for each value
# to find infinite loops and raise an error. In multithread mode a mutex is used to
# avoid race conditions between threads.

# :nodoc:
class Crystal::OnceState
@rec = [] of Bool*

def once(flag : Bool*, initializer : Void*)
unless flag.value
if @rec.includes?(flag)
raise "Recursion while initializing class variables and/or constants"
# This file defines two functions expected by the compiler:
#
# - `__crystal_once_init`: executed only once at the beginning of the program
# and, for the legacy implementation, the result is passed on each call to
# `__crystal_once`.
#
# - `__crystal_once`: called each time a constant or class variable has to be
# initialized and is its responsibility to verify the initializer is executed
# only once and to fail on recursion.

# In multithread mode a mutex is used to avoid race conditions between threads.
#
# On Win32, `Crystal::System::FileDescriptor#@@reader_thread` spawns a new
# thread even without the `preview_mt` flag, and the thread can also reference
# Crystal constants, leading to race conditions, so we always enable the mutex.

{% if compare_versions(Crystal::VERSION, "1.15.0-dev") >= 0 %}
# This implementation uses an enum over the initialization flag pointer for
# each value to find infinite loops and raise an error.

module Crystal
enum OnceState : Int8
Processing = -1
Uninitialized = 0
Initialized = 1
end

{% if flag?(:preview_mt) || flag?(:win32) %}
@@once_mutex = uninitialized Mutex

def self.once_mutex : Mutex
@@once_mutex
end
@rec << flag

Proc(Nil).new(initializer, Pointer(Void).null).call
flag.value = true
def self.once_mutex=(@@once_mutex : Mutex)
end
{% end %}

@rec.pop
# Using @[NoInline] so LLVM optimizes for the hot path (var already
# initialized).
@[NoInline]
def self.once(flag : OnceState*, initializer : Void*) : Nil
{% if flag?(:preview_mt) || flag?(:win32) %}
Crystal.once_mutex.synchronize { once_exec(flag, initializer) }
{% else %}
once_exec(flag, initializer)
{% end %}
end

private def self.once_exec(flag : OnceState*, initializer : Void*) : Nil
case flag.value
in .initialized?
return
in .uninitialized?
flag.value = :processing
Proc(Nil).new(initializer, Pointer(Void).null).call
flag.value = :initialized
in .processing?
raise "Recursion while initializing class variables and/or constants"
end
end
end

# on Win32, `Crystal::System::FileDescriptor#@@reader_thread` spawns a new
# thread even without the `preview_mt` flag, and the thread can also reference
# Crystal constants, leading to race conditions, so we always enable the mutex
# TODO: can this be improved?
{% if flag?(:preview_mt) || flag?(:win32) %}
@mutex = Mutex.new(:reentrant)
# :nodoc:
fun __crystal_once_init : Nil
{% if flag?(:preview_mt) || flag?(:win32) %}
Crystal.once_mutex = Mutex.new(:reentrant)
{% end %}
end

# :nodoc:
#
# Using `@[AlwaysInline]` allows LLVM to optimize const accesses. Since this
# is a `fun` the function will still appear in the symbol table, though it
# will never be called.
@[AlwaysInline]
fun __crystal_once(flag : Int8*, initializer : Void*) : Nil
flag = flag.as(Crystal::OnceState*)
Crystal.once(flag, initializer) unless flag.value.initialized?
end
{% else %}
# This implementation uses a global array to store the initialization flag
# pointers for each value to find infinite loops and raise an error.

# :nodoc:
class Crystal::OnceState
@rec = [] of Bool*

def once(flag : Bool*, initializer : Void*)
unless flag.value
@mutex.synchronize do
previous_def
if @rec.includes?(flag)
raise "Recursion while initializing class variables and/or constants"
end
@rec << flag

Proc(Nil).new(initializer, Pointer(Void).null).call
flag.value = true

@rec.pop
end
end
{% end %}
end

# :nodoc:
fun __crystal_once_init : Void*
Crystal::OnceState.new.as(Void*)
end

# :nodoc:
fun __crystal_once(state : Void*, flag : Bool*, initializer : Void*)
state.as(Crystal::OnceState).once(flag, initializer)
end

{% if flag?(:preview_mt) || flag?(:win32) %}
@mutex = Mutex.new(:reentrant)

def once(flag : Bool*, initializer : Void*)
unless flag.value
@mutex.synchronize do
previous_def
end
end
end
{% end %}
end

# :nodoc:
fun __crystal_once_init : Void*
Crystal::OnceState.new.as(Void*)
end

# :nodoc:
fun __crystal_once(state : Void*, flag : Bool*, initializer : Void*)
state.as(Crystal::OnceState).once(flag, initializer)
end
{% end %}

2 comments on commit cf49927

@BlobCodes
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've worked on another version of the ocne mechanism a few days ago that doesn't require compiler changes and doesn't require memory allocations even with mt enabled (since the original __crystal_once re-do was for an embedded stdlib).
I probably have to re-do the atomic fences and clean up the code though.

crystal-lang/crystal@master...BlobCodes:crystal:perf/crystal-once-v2


The mentioned branch adds the following line in fun __crystal_once:

__crystal_once_unreachable unless flag.value

I noticed that adding this code prevents LLVM from re-running the once mechanism multiple times for the same variable.

@ysbaddaden
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BlobCodes Oh, the unreachable trick is weird yet interesting. I feel most of your changes are about inlining the Mutex in order to spare allocations?

On this topic, I wrote a patch yesterday to replace the Deque with a PointerLinkedList in Mutex. I'll push it later today or tomorrow. We could also try ReferenceStorage and unsafe_construct to inline the Mutex. Then __crystal_once_init would be allocation free.

I also have an in-progress patch to replace Thread::Mutex and Mutex to use the algorithm from nsync as used in Cosmopolitan (see https://justine.lol/mutex) and Crystal::SpinLock and Crystal::RWLock are likely to use it too (it's magnitude more efficient).

Please sign in to comment.