-
-
Notifications
You must be signed in to change notification settings - Fork 327
/
SynFPCx64MM.pas
3420 lines (3189 loc) · 127 KB
/
SynFPCx64MM.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/// Fast Memory Manager for FPC x86_64
// - this unit is a part of the freeware Synopse mORMot framework
// licensed under a MPL/GPL/LGPL three license - see LICENSE.md
unit SynFPCx64MM;
{
*****************************************************************************
A Multi-thread Friendly Memory Manager for FPC written in x86_64 assembly
- targetting Linux (and Windows) multi-threaded Services
- only for FPC on the x86_64 target - use the RTL MM on Delphi or ARM
- based on proven FastMM4 by Pierre le Riche - with tuning and enhancements
- can report detailed statistics (with threads contention and memory leaks)
- three app modes: default mono-thread friendly, FPCMM_SERVER or FPCMM_BOOST
Usage: include this unit as the very first in your FPC project uses clause
Why another Memory Manager on FPC?
- The built-in heap.inc is well written and cross-platform and cross-CPU,
but its threadvar arena for small blocks tends to consume a lot of memory
on multi-threaded servers, and has suboptimal allocation performance
- C memory managers (glibc, Intel TBB, jemalloc) have a very high RAM
consumption (especially Intel TBB) and do panic/SIGKILL on any GPF
- Pascal alternatives (FastMM4,ScaleMM2,BrainMM) are Windows+Delphi specific
- Our lockess round-robin of tiny blocks and freemem bin list are unique
algorithms among Memory Managers, and match modern CPUs and workloads
- It was so fun diving into SSE2 x86_64 assembly and Pierre's insight
- Resulting code is still easy to understand and maintain
DISCLAMER: seems stable on Linux and Win64 but feedback is welcome!
*****************************************************************************
This file is part of Synopse framework.
Synopse framework. Copyright (c) Arnaud Bouchez
Synopse Informatique - https://synopse.info
*** BEGIN LICENSE BLOCK *****
Version: MPL 1.1/GPL 2.0/LGPL 2.1
The contents of this file are subject to the Mozilla Public License Version
1.1 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
for the specific language governing rights and limitations under the License.
The Original Code is Synopse mORMot framework.
The Initial Developer of the Original Code is Arnaud Bouchez.
Portions created by the Initial Developer are Copyright (c)
the Initial Developer. All Rights Reserved.
Contributor(s):
Alternatively, the contents of this file may be used under the terms of
either the GNU General Public License Version 2 or later (the "GPL"), or
the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
in which case the provisions of the GPL or the LGPL are applicable instead
of those above. If you wish to allow use of your version of this file only
under the terms of either the GPL or the LGPL, and not to allow others to
use your version of this file under the terms of the MPL, indicate your
decision by deleting the provisions above and replace them with the notice
and other provisions required by the GPL or the LGPL. If you do not delete
the provisions above, a recipient may use your version of this file under
the terms of any one of the MPL, the GPL or the LGPL.
***** END LICENSE BLOCK *****
}
{ ---- Ready-To-Use Scenarios for Memory Manager Tuning }
{
TL;DR:
1. default settings target LCL/console mono-threaded apps;
2. define FPCMM_SERVER for a multi-threaded service/daemon.
}
// target a multi-threaded service on a modern CPU
// - define FPCMM_DEBUG, FPCMM_ASSUMEMULTITHREAD, FPCMM_ERMS, FPCMM_LOCKLESSFREE
// - currently mormot2tests run with no sleep when FPCMM_SERVER is set :)
// - you may try to define FPCMM_BOOST for even more aggressive settings.
{.$define FPCMM_SERVER}
// increase settings for very aggressive multi-threaded process
// - try to enable it if unexpected SmallGetmemSleepCount/SmallFreememSleepCount
// and SleepCount/SleepCycles contentions are reported by CurrentHeapStatus;
// - tiny blocks will be <= 256 bytes (instead of 128 bytes);
// - FPCMM_BOOSTER will use 2x more tiny blocks arenas - likely to be wasteful;
// - will enable FPCMM_SMALLNOTWITHMEDIUM trying to reduce medium sleeps;
// - warning: depending on the workload and hardware, it may actually be slower,
// triggering more Meidum arena contention, and consuming more RAM: consider
// FPCMM_SERVER as a fair alternative.
{.$define FPCMM_BOOST}
{.$define FPCMM_BOOSTER}
{ ---- Fine Grained Memory Manager Tuning }
// includes more detailed information to WriteHeapStatus()
{.$define FPCMM_DEBUG}
// on thread contention, don't spin executing "pause" but directly call Sleep()
// - may help on a single core CPU, or for very specific workloads
{.$define FPCMM_NOPAUSE}
// let FPCMM_DEBUG include SleepCycles information from rdtsc
// and FPCMM_PAUSE call rdtsc for its spinnning loop
// - since rdtsc is emulated so unrealiable on VM, it is disabled by default
{.$define FPCMM_SLEEPTSC}
// checks leaks and write them to the console at process shutdown
// - only basic information will be included: more debugging information (e.g.
// call stack) may be gathered using heaptrc or valgrid
{.$define FPCMM_REPORTMEMORYLEAKS}
// won't check the IsMultiThread global, but assume it is true
// - multi-threaded apps (e.g. a Server Daemon instance) will be faster with it
// - mono-threaded (console/LCL) apps are faster without this conditional
{.$define FPCMM_ASSUMEMULTITHREAD}
// let Freemem multi-thread contention use a lockless algorithm
// - on contention, Freemem won't yield the thread using an OS call, but fill
// an internal Bin list which will be released when the lock becomes available
// - beneficial from our tests on high thread contention (HTTP/REST server)
{.$define FPCMM_LOCKLESSFREE}
// won't use mremap but a regular getmem/move/freemem pattern for large blocks
// - depending on the actual system (e.g. on a VM), mremap may be slower
// - will disable Linux mremap() or Windows following block VirtualQuery/Alloc
{.$define FPCMM_NOMREMAP}
// force the tiny/small blocks to be in their own arena, not with medium blocks
// - would use a little more memory, but medium pool is less likely to sleep
// - not defined for FPCMM_SERVER because no performance difference was found
{.$define FPCMM_SMALLNOTWITHMEDIUM}
// use "rep movsb/stosd" ERMS for blocks > 256 bytes instead of SSE2 "movaps"
// - ERMS is available since Ivy Bridge, and we use "movaps" for smallest blocks
// (to not slow down older CPUs), so it is safe to enable this on FPCMM_SERVER
{.$define FPCMM_ERMS}
// try "cmp" before "lock cmpxchg" for old processors with huge lock penalty
{.$define FPCMM_CMPBEFORELOCK}
// will export libc-like functions, and not replace the FPC MM
// - e.g. to use this unit as a stand-alone C memory allocator
{.$define FPCMM_STANDALONE}
// this whole unit will compile as void
// - may be defined e.g. when compiled as Design-Time Lazarus package
{.$define FPCMM_DISABLE}
interface
{$ifdef FPC}
// cut-down version of Synopse.inc to make this unit standalone
{$mode Delphi}
{$inline on}
{$R-} // disable Range checking
{$S-} // disable Stack checking
{$W-} // disable stack frame generation
{$Q-} // disable overflow checking
{$B-} // expect short circuit boolean
{$ifdef CPUX64}
{$define FPCX64MM} // this unit is for FPC + x86_64 only
{$asmmode Intel}
{$endif CPUX64}
{$ifdef FPCMM_BOOSTER}
{$define FPCMM_BOOST}
{$endif FPCMM_BOOSTER}
{$ifdef FPCMM_BOOST}
{$define FPCMM_SERVER}
{$define FPCMM_SMALLNOTWITHMEDIUM}
{$endif FPCMM_BOOST}
{$ifdef FPCMM_SERVER}
{$define FPCMM_DEBUG}
{$define FPCMM_ASSUMEMULTITHREAD}
{$define FPCMM_LOCKLESSFREE}
{$define FPCMM_ERMS}
{$endif FPCMM_SERVER}
{$ifdef FPCMM_BOOSTER}
{$undef FPCMM_DEBUG} // when performance matters more than stats
{$endif FPCMM_BOOSTER}
{$endif FPC}
{$ifdef FPCMM_DISABLE}
{$undef FPCX64MM} // e.g. when compiled as Design-Time Lazarus package
{$endif FPCMM_DISABLE}
{$ifdef FPCX64MM}
// this unit is available only for FPC + X86_64 CPU
// other targets would compile as a void unit
type
/// Arena (middle/large) heap information as returned by CurrentHeapStatus
TMMStatusArena = record
/// how many bytes are currently reserved (mmap) to the Operating System
CurrentBytes: PtrUInt;
/// how many bytes have been reserved (mmap) to the Operating System
CumulativeBytes: PtrUInt;
{$ifdef FPCMM_DEBUG}
/// maximum bytes count reserved (mmap) to the Operating System
PeakBytes: PtrUInt;
/// how many VirtualAlloc/mmap calls to the Operating System did occur
CumulativeAlloc: PtrUInt;
/// how many VirtualFree/munmap calls to the Operating System did occur
CumulativeFree: PtrUInt;
{$endif FPCMM_DEBUG}
/// how many times this Arena did wait from been unlocked by another thread
SleepCount: PtrUInt;
end;
/// heap information as returned by CurrentHeapStatus
TMMStatus = record
/// how many tiny/small memory blocks (<=2600 bytes) are currently allocated
SmallBlocks: PtrUInt;
/// how many bytes of tiny/small memory blocks are currently allocated
// - this size is included in Medium.CurrentBytes value, even if
// FPCMM_SMALLNOTWITHMEDIUM has been defined
SmallBlocksSize: PtrUInt;
/// information about blocks up to 256KB (tiny, small and medium)
// - includes also the memory needed for tiny/small blocks
Medium: TMMStatusArena;
/// information about large blocks > 256KB
// - those blocks are directly handled by the Operating System
Large: TMMStatusArena;
{$ifdef FPCMM_DEBUG}
{$ifdef FPCMM_SLEEPTSC}
/// how much rdtsc cycles were spent within SwitchToThread/NanoSleep API
// - we rdtsc since it is an indicative but very fast way of timing on
// direct hardware
// - warning: on virtual machines, the rdtsc opcode is usually emulated so
// these SleepCycles number are non indicative anymore
SleepCycles: PtrUInt;
{$endif FPCMM_SLEEPTSC}
{$ifdef FPCMM_LOCKLESSFREE}
/// how many times Freemem() did spin to acquire its lock-less bin list
SmallFreememLockLessSpin: PtrUInt;
{$endif FPCMM_LOCKLESSFREE}
{$endif FPCMM_DEBUG}
/// how many times the Operating System Sleep/NanoSleep API was called
// - in a perfect world, should be as small as possible: mormot2tests run as
// SleepCount=0 when FPCMM_LOCKLESSFREE is defined (FPCMM_SERVER default)
SleepCount: PtrUInt;
/// how many times Getmem() did block and wait for a tiny/small block
// - see also GetSmallBlockContention() for more detailed information
SmallGetmemSleepCount: PtrUInt;
/// how many times Freemem() did block and wait for a tiny/small block
// - see also GetSmallBlockContention() for more detailed information
SmallFreememSleepCount: PtrUInt;
end;
PMMStatus = ^TMMStatus;
/// allocate a new memory buffer
// - as FPC default heap, _Getmem(0) returns _Getmem(1)
function _GetMem(size: PtrUInt): pointer;
/// allocate a new zeroed memory buffer
function _AllocMem(Size: PtrUInt): pointer;
/// release a memory buffer
// - returns the allocated size of the supplied pointer (as FPC default heap)
function _FreeMem(P: pointer): PtrUInt;
/// change the size of a memory buffer
// - won't move any data if in-place reallocation is possible
// - as FPC default heap, _ReallocMem(P=nil,Size) maps P := _getmem(Size) and
// _ReallocMem(P,0) maps _Freemem(P)
function _ReallocMem(var P: pointer; Size: PtrUInt): pointer;
/// retrieve the maximum size (i.e. the allocated size) of a memory buffer
function _MemSize(P: pointer): PtrUInt; inline;
/// retrieve high-level statistics about the current memory manager state
// - see also GetSmallBlockContention for detailed small blocks information
// - standard GetHeapStatus and GetFPCHeapStatus gives less accurate information
// (only CurrHeapSize and MaxHeapSize are set), since we don't track "free" heap
// bytes: I can't figure how "free" memory is relevant nowadays - on 21th century
// Operating Systems, memory is virtual, and reserved/mapped by the OS but
// physically hosted in the HW RAM chips only when written the first time -
// GetHeapStatus information made sense on MSDOS with fixed 640KB of RAM
// - note that FPC GetHeapStatus and GetFPCHeapStatus is only about the
// current thread (irrelevant for sure) whereas CurrentHeapStatus is global
function CurrentHeapStatus: TMMStatus;
{$ifdef FPCMM_STANDALONE}
/// should be called before using any memory function
procedure InitializeMemoryManager;
/// should be called to finalize this memory manager process and release all RAM
procedure FreeAllMemory;
{$undef FPCMM_DEBUG} // excluded FPC-specific debugging
/// IsMultiThread global variable is not correct outside of the FPC RTL
{$define FPCMM_ASSUMEMULTITHREAD}
/// not supported to reduce dependencies and console writing
{$undef FPCMM_REPORTMEMORYLEAKS}
{$else}
type
/// one GetSmallBlockContention info about unexpected multi-thread waiting
// - a single GetmemBlockSize or FreememBlockSize non 0 field is set
TSmallBlockContention = packed record
/// how many times a small block getmem/freemem has been waiting for unlock
SleepCount: cardinal;
/// the small block size on which Getmem() has been blocked - or 0
GetmemBlockSize: cardinal;
/// the small block size on which Freemem() has been blocked - or 0
FreememBlockSize: cardinal;
end;
/// small blocks detailed information as returned GetSmallBlockContention
TSmallBlockContentionDynArray = array of TSmallBlockContention;
/// one GetSmallBlockStatus information
TSmallBlockStatus = packed record
/// how many times a memory block of this size has been allocated
Total: cardinal;
/// how many memory blocks of this size are currently allocated
Current: cardinal;
/// the standard size of the small memory block
BlockSize: cardinal;
end;
/// small blocks detailed information as returned GetSmallBlockStatus
TSmallBlockStatusDynArray = array of TSmallBlockStatus;
/// sort order of detailed information as returned GetSmallBlockStatus
TSmallBlockOrderBy = (obTotal, obCurrent, obBlockSize);
/// retrieve the use counts of allocated small blocks
// - returns maxcount biggest results, sorted by "orderby" field occurence
function GetSmallBlockStatus(maxcount: integer = 10;
orderby: TSmallBlockOrderBy = obTotal; count: PPtrUInt = nil; bytes: PPtrUInt = nil;
small: PCardinal = nil; tiny: PCardinal = nil): TSmallBlockStatusDynArray;
/// retrieve all small blocks which suffered from blocking during multi-thread
// - returns maxcount biggest results, sorted by SleepCount occurence
function GetSmallBlockContention(
maxcount: integer = 10): TSmallBlockContentionDynArray;
/// convenient debugging function into the console
// - if smallblockcontentioncount > 0, includes GetSmallBlockContention() info
// up to the smallblockcontentioncount biggest occurences
procedure WriteHeapStatus(const context: shortstring = '';
smallblockstatuscount: integer = 8; smallblockcontentioncount: integer = 8;
compilationflags: boolean = false);
/// convenient debugging function into a string
// - if smallblockcontentioncount > 0, includes GetSmallBlockContention() info
// up to the smallblockcontentioncount biggest occurences
// - warning: this function is not thread-safe
function GetHeapStatus(const context: shortstring; smallblockstatuscount,
smallblockcontentioncount: integer; compilationflags, onsameline: boolean): string;
const
/// human readable information about how our MM was built
// - similar to WriteHeapStatus(compilationflags=true) output
FPCMM_FLAGS = ' '
{$ifdef FPCMM_BOOSTER} + 'BOOSTER ' {$else}
{$ifdef FPCMM_BOOST} + 'BOOST ' {$else}
{$ifdef FPCMM_SERVER} + 'SERVER ' {$endif}
{$endif}
{$endif}
{$ifdef FPCMM_ASSUMEMULTITHREAD} + ' assumulthrd' {$endif}
{$ifdef FPCMM_LOCKLESSFREE} + ' lockless' {$endif}
{$ifdef FPCMM_PAUSE} + ' pause' {$endif}
{$ifdef FPCMM_SLEEPTSC} + ' rdtsc' {$endif}
{$ifndef BSD}
{$ifdef FPCMM_NOMREMAP} + ' nomremap' {$endif}
{$endif BSD}
{$ifdef FPCMM_SMALLNOTWITHMEDIUM}+ ' smallpool' {$endif}
{$ifdef FPCMM_ERMS} + ' erms' {$endif}
{$ifdef FPCMM_DEBUG} + ' debug' {$endif}
{$ifdef FPCMM_REPORTMEMORYLEAKS} + ' repmemleak' {$endif};
{$endif FPCMM_STANDALONE}
{$endif FPCX64MM}
implementation
{
High-level Allocation Strategy Description
--------------------------------------------
The allocator handles the following families of memory blocks:
- TINY <= 128 B (<= 256 B for FPCMM_BOOST)
Round-robin distribution into several arenas, fed from shared tiny/small pool
(fair scaling from multi-threaded calls, with no threadvar nor GC involved)
- SMALL <= 2600 B
One arena per block size, fed from shared tiny/small pool
- MEDIUM <= 256 KB
Separated pool of bitmap-marked chunks, fed from 1MB of OS mmap/virtualalloc
- LARGE > 256 KB
Directly fed from OS mmap/virtualalloc with mremap when growing
The original FastMM4 was enhanced as such, especially in FPCMM_SERVER mode:
- FPC compatibility, even on POSIX/Linux, also for FPC specific API behavior;
- x86_64 code was refactored and tuned in regard to 2020's hardware;
- Inlined SSE2 movaps loop or ERMS are more efficient that subfunction(s);
- New round-robin thread-friendly arenas of tiny blocks;
- Tiny and small blocks can fed from their own pool, not the medium pool;
- Additional bin list to reduce small/tiny Freemem() thread contention;
- Memory leaks and thread sleep tracked with almost no performance loss;
- Large blocks logic has been rewritten, especially realloc;
- On Linux, mremap is used for efficient realloc of large blocks.
About locking:
- Tiny and Small blocks have their own per-size lock;
- Tiny and Small blocks have one giant lock when fedding from their pool;
- Medium blocks have one giant lock over their own pool;
- Large blocks have one giant lock over mmap/virtualalloc system calls;
- SwitchToThread/FpNanoSleep OS call is done after initial spinning;
- FPCMM_LOCKLESSFREE reduces Freemem() thread contention;
- FPCMM_DEBUG / WriteHeapStatus helps identifying the lock contention(s).
}
{$ifdef FPCX64MM}
// this unit is available only for FPC + X86_64 CPU
{$ifndef FPCMM_NOPAUSE}
// on contention problem, execute "pause" opcode and spin retrying the lock
// - defined by default to follow Intel recommendatations from
// https://software.intel.com/content/www/us/en/develop/articles/benefitting-power-and-performance-sleep-loops.html
// - spinning loop is either using constants or rdtsc (if FPCMM_SLEEPTSC is set)
// - on SkylakeX (Intel 7th gen), "pause" opcode went from 10-20 to 140 cycles
// so our constants below will favor those latest CPUs with a longer pause
{$define FPCMM_PAUSE}
{$endif FPCMM_NOPAUSE}
{ ********* Operating System Specific API Calls }
{$ifdef MSWINDOWS}
// Win64: any assembler function with sub-calls should have a stack frame
// -> nostackframe is defined only on Linux or for functions with no nested call
{$undef NOSFRAME}
const
kernel32 = 'kernel32.dll';
MEM_COMMIT = $1000;
MEM_RESERVE = $2000;
MEM_RELEASE = $8000;
MEM_FREE = $10000;
MEM_TOP_DOWN = $100000;
PAGE_READWRITE = 4;
PAGE_GUARD = $0100;
PAGE_VALID = $00e6; // PAGE_READONLY or PAGE_READWRITE or PAGE_EXECUTE or
// PAGE_EXECUTE_READ or PAGE_EXECUTE_READWRITE or PAGE_EXECUTE_WRITECOPY
type
// VirtualQuery() API result structure
TMemInfo = record
BaseAddress, AllocationBase: pointer;
AllocationProtect: cardinal;
PartitionId: word;
RegionSize: PtrUInt;
State, Protect, MemType: cardinal;
end;
function VirtualAlloc(lpAddress: pointer;
dwSize: PtrUInt; flAllocationType, flProtect: cardinal): pointer;
stdcall; external kernel32 name 'VirtualAlloc';
function VirtualFree(lpAddress: pointer; dwSize: PtrUInt;
dwFreeType: cardinal): LongBool;
stdcall; external kernel32 name 'VirtualFree';
procedure SwitchToThread;
stdcall; external kernel32 name 'SwitchToThread';
function VirtualQuery(lpAddress, lpMemInfo: pointer; dwLength: PtrUInt): PtrUInt;
stdcall; external kernel32 name 'VirtualQuery';
function AllocMedium(Size: PtrInt): pointer; inline;
begin
// bottom-up allocation to reduce fragmentation
result := VirtualAlloc(nil, Size, MEM_COMMIT, PAGE_READWRITE);
end;
function AllocLarge(Size: PtrInt): pointer; inline;
begin
// FastMM4 uses top-down allocation (MEM_TOP_DOWN) of large blocks to "reduce
// fragmentation", but on a 64-bit system I am not sure of this statement, and
// VirtualAlloc() was reported to have a huge slowdown due to this option
// https://randomascii.wordpress.com/2011/08/05/making-virtualalloc-arbitrarily-slower
result := VirtualAlloc(nil, Size, MEM_COMMIT, PAGE_READWRITE);
end;
procedure FreeMediumLarge(ptr: pointer; Size: PtrInt); inline;
begin
VirtualFree(ptr, 0, MEM_RELEASE);
end;
{$ifndef FPCMM_NOMREMAP}
function RemapLarge(addr: pointer; old_len, new_len: size_t): pointer;
var
meminfo: TMemInfo;
next: pointer;
nextsize: PtrUInt;
begin
// old_len and new_len have 64KB granularity, so match Windows page size
nextsize := new_len - old_len;
if PtrInt(nextsize) > 0 then
begin
// try to allocate the memory just after the existing one
FillChar(meminfo, SizeOf(meminfo), 0);
next := addr + old_len;
if (VirtualQuery(next, @meminfo, SizeOf(meminfo)) = SizeOf(meminfo)) and
(meminfo.State = MEM_FREE) and
(meminfo.RegionSize >= nextsize) and // enough space?
// reserve the address space in two steps for thread safety
(VirtualAlloc(next, nextsize, MEM_RESERVE, PAGE_READWRITE) <> nil) and
(VirtualAlloc(next, nextsize, MEM_COMMIT, PAGE_READWRITE) <> nil) then
begin
result := addr; // in-place realloc: no need to move memory :)
exit;
end;
end;
// we need to use the slower but safe Alloc/Move/Free pattern
result := AllocLarge(new_len);
if new_len > old_len then
new_len := old_len; // handle size up or down
Move(addr^, result^, new_len); // RTL non-volatile asm or our AVX MoveFast()
FreeMediumLarge(addr, old_len);
end;
{$endif FPCMM_NOMREMAP}
// experimental VirtualQuery detection of object class - use at your own risk
{$define FPCMM_REPORTMEMORYLEAKS_EXPERIMENTAL}
{$else}
uses
{$ifndef DARWIN}
syscall,
{$endif DARWIN}
BaseUnix;
// in practice, SYSV ABI seems to not require a stack frame, as Win64 does, for
// our use case of nested calls with no local stack storage and direct kernel
// syscalls - but since it is clearly undocumented, we set it on LINUX only
// -> appears to work with no problem from our tests: feedback is welcome!
// -> see FPCMM_NOSFRAME conditional to disable it on LINUX
{$ifdef LINUX}
{$define NOSFRAME}
{$endif LINUX}
// we directly call the OS Kernel, so this unit doesn't require any libc
function AllocMedium(Size: PtrInt): pointer; inline;
begin
result := fpmmap(nil, Size, PROT_READ or PROT_WRITE,
MAP_PRIVATE or MAP_ANONYMOUS, -1, 0);
end;
function AllocLarge(Size: PtrInt): pointer; inline;
begin
result := AllocMedium(size); // same API (no MEM_TOP_DOWN option)
end;
procedure FreeMediumLarge(ptr: pointer; Size: PtrInt); inline;
begin
fpmunmap(ptr, Size);
end;
{$ifdef LINUX}
{$ifndef FPCMM_NOMREMAP}
const
syscall_nr_mremap = 25; // valid on x86_64 Linux and Android
MREMAP_MAYMOVE = 1;
function RemapLarge(addr: pointer; old_len, new_len: size_t): pointer; inline;
begin
// let the Linux Kernel mremap() the memory using its TLB magic
result := pointer(do_syscall(syscall_nr_mremap, TSysParam(addr),
TSysParam(old_len), TSysParam(new_len), TSysParam(MREMAP_MAYMOVE)));
end;
{$endif FPCMM_NOMREMAP}
// experimental detection of object class - use at your own risk
{$define FPCMM_REPORTMEMORYLEAKS_EXPERIMENTAL}
// (untested on BSD/DARWIN)
{$else}
{$define FPCMM_NOMREMAP} // mremap is a Linux-specific syscall
{$endif LINUX}
procedure SwitchToThread;
var
t: Ttimespec;
begin
// note: nanosleep() adds a few dozen of microsecs for context switching
t.tv_sec := 0;
t.tv_nsec := 10; // empirically identified on a recent Linux Kernel
fpnanosleep(@t, nil);
end;
{$endif MSWINDOWS}
// fallback to safe and simple Alloc/Move/Free pattern
{$ifdef FPCMM_NOMREMAP}
function RemapLarge(addr: pointer; old_len, new_len: size_t): pointer;
begin
result := AllocLarge(new_len);
if new_len > old_len then
new_len := old_len; // resize down
Move(addr^, result^, new_len); // RTL non-volatile asm or our AVX MoveFast()
FreeMediumLarge(addr, old_len);
end;
{$endif FPCMM_NOMREMAP}
{ ********* Some Assembly Helpers }
// low-level conditional to disable nostackframe code on Linux
{$ifdef FPCMM_NOSFRAME}
{$undef NOSFRAME}
{$endif FPCMM_NOSFRAME}
var
HeapStatus: TMMStatus;
{$ifdef FPCMM_DEBUG}
procedure ReleaseCore;
{$ifdef NOSFRAME} nostackframe; {$endif} assembler;
asm
{$ifdef FPCMM_SLEEPTSC}
rdtsc // returns the TSC in EDX:EAX
shl rdx, 32
or rax, rdx
push rax
call SwitchToThread
pop rcx
rdtsc
shl rdx, 32
or rax, rdx
lea rdx, [rip + HeapStatus]
sub rax, rcx
lock add qword ptr [rdx + TMMStatus.SleepCycles], rax
{$else}
call SwitchToThread
lea rdx, [rip + HeapStatus]
{$endif FPCMM_SLEEPTSC}
lock inc qword ptr [rdx + TMMStatus.SleepCount]
end;
{$else}
procedure ReleaseCore;
begin
SwitchToThread;
inc(HeapStatus.SleepCount); // indicative counter
end;
{$endif FPCMM_DEBUG}
procedure NotifyArenaAlloc(var Arena: TMMStatusArena; Size: PtrUInt);
nostackframe; assembler;
asm
{$ifdef FPCMM_DEBUG}
lock add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
lock add qword ptr [Arena].TMMStatusArena.CumulativeBytes, Size
lock inc qword ptr [Arena].TMMStatusArena.CumulativeAlloc
mov rax, qword ptr [Arena].TMMStatusArena.CurrentBytes
cmp rax, qword ptr [Arena].TMMStatusArena.PeakBytes
jbe @s
mov qword ptr [Arena].TMMStatusArena.PeakBytes, rax
@s: {$else}
add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
add qword ptr [Arena].TMMStatusArena.CumulativeBytes, Size
{$endif FPCMM_DEBUG}
end;
procedure NotifyMediumLargeFree(var Arena: TMMStatusArena; Size: PtrUInt);
nostackframe; assembler;
asm
neg Size
{$ifdef FPCMM_DEBUG}
lock add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
lock inc qword ptr [Arena].TMMStatusArena.CumulativeFree
{$else}
add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
{$endif FPCMM_DEBUG}
end;
{ ********* Constants and Data Structures Definitions }
const
// (sometimes) the more arenas, the better multi-threadable
{$ifdef FPCMM_BOOSTER}
NumTinyBlockTypesPO2 = 4;
NumTinyBlockArenasPO2 = 4; // will probably end up with Medium lock contention
{$else}
{$ifdef FPCMM_BOOST}
NumTinyBlockTypesPO2 = 4; // tiny are <= 256 bytes
NumTinyBlockArenasPO2 = 3; // 8 arenas
{$else}
// default (or FPCMM_SERVER) settings
NumTinyBlockTypesPO2 = 3; // multiple arenas for tiny blocks <= 128 bytes
NumTinyBlockArenasPO2 = 3; // 8 round-robin arenas (including main) by default
{$endif FPCMM_BOOST}
{$endif FPCMM_BOOSTER}
NumSmallBlockTypes = 46;
MaximumSmallBlockSize = 2608;
SmallBlockSizes: array[0..NumSmallBlockTypes - 1] of word = (
16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256,
272, 288, 304, 320, 352, 384, 416, 448, 480, 528, 576, 624, 672, 736, 800,
880, 960, 1056, 1152, 1264, 1376, 1504, 1648, 1808, 1984, 2176, 2384,
MaximumSmallBlockSize, MaximumSmallBlockSize, MaximumSmallBlockSize);
NumTinyBlockTypes = 1 shl NumTinyBlockTypesPO2;
NumTinyBlockArenas = (1 shl NumTinyBlockArenasPO2) - 1; // -1 = main Small[]
NumSmallInfoBlock = NumSmallBlockTypes + NumTinyBlockArenas * NumTinyBlockTypes;
SmallBlockGranularity = 16;
TargetSmallBlocksPerPool = 48;
MinimumSmallBlocksPerPool = 12;
SmallBlockDownsizeCheckAdder = 64;
SmallBlockUpsizeAdder = 32;
{$ifdef FPCMM_LOCKLESSFREE}
SmallBlockTypePO2 = 8; // SizeOf(TSmallBlockType)=256 with Bin list
SmallBlockBinCount = (((1 shl SmallBlockTypePO2) - 64) div 8) - 1; // =23
{$else}
SmallBlockTypePO2 = 6; // SizeOf(TSmallBlockType)=64
{$endif FPCMM_LOCKLESSFREE}
MediumBlockPoolSizeMem = 20 * 64 * 1024;
MediumBlockPoolSize = MediumBlockPoolSizeMem - 16;
MediumBlockSizeOffset = 48;
MinimumMediumBlockSize = 11 * 256 + MediumBlockSizeOffset;
MediumBlockBinsPerGroup = 32;
MediumBlockBinGroupCount = 32;
MediumBlockBinCount = MediumBlockBinGroupCount * MediumBlockBinsPerGroup;
MediumBlockGranularity = 256;
MaximumMediumBlockSize =
MinimumMediumBlockSize + (MediumBlockBinCount - 1) * MediumBlockGranularity;
OptimalSmallBlockPoolSizeLowerLimit =
29 * 1024 - MediumBlockGranularity + MediumBlockSizeOffset;
OptimalSmallBlockPoolSizeUpperLimit =
64 * 1024 - MediumBlockGranularity + MediumBlockSizeOffset;
MaximumSmallBlockPoolSize =
OptimalSmallBlockPoolSizeUpperLimit + MinimumMediumBlockSize;
MediumInPlaceDownsizeLimit = MinimumMediumBlockSize div 4;
IsFreeBlockFlag = 1;
IsMediumBlockFlag = 2;
IsSmallBlockPoolInUseFlag = 4;
IsLargeBlockFlag = 4;
PreviousMediumBlockIsFreeFlag = 8;
LargeBlockIsSegmented = 8;
DropSmallFlagsMask = -8;
ExtractSmallFlagsMask = 7;
DropMediumAndLargeFlagsMask = -16;
ExtractMediumAndLargeFlagsMask = 15;
{$ifdef FPCMM_SLEEPTSC}
// pause using rdtsc (30 cycles latency on hardware but emulated on VM)
SpinMediumLockTSC = 10000;
SpinLargeLockTSC = 10000;
{$ifdef FPCMM_PAUSE}
SpinSmallGetmemLockTSC = 1000;
SpinSmallFreememLockTSC = 1000; // _freemem has more collisions
{$ifdef FPCMM_LOCKLESSFREE}
SpinSmallFreememBinTSC = 2000;
{$endif FPCMM_LOCKLESSFREE}
{$endif FPCMM_PAUSE}
{$else}
// pause with constant spinning counts (empirical values from fastmm4-avx)
SpinMediumLockCount = 2500;
SpinLargeLockCount = 5000;
{$ifdef FPCMM_PAUSE}
SpinSmallGetmemLockCount = 500;
SpinSmallFreememLockCount = 500;
{$ifdef FPCMM_LOCKLESSFREE}
SpinFreememBinCount = 500;
{$endif FPCMM_LOCKLESSFREE}
{$endif FPCMM_PAUSE}
{$endif FPCMM_SLEEPTSC}
{$ifdef FPCMM_ERMS}
// pre-ERMS expects at least 256 bytes, IvyBridge+ with ERMS is good from 64
// see https://stackoverflow.com/a/43837564/458259 for explanations and timing
// -> "movaps" loop is used up to 256 bytes of data: good on all CPUs
// -> "movnt" Move/MoveFast is used for large blocks: always faster than ERMS
ErmsMinSize = 256;
{$endif FPCMM_ERMS}
type
PSmallBlockPoolHeader = ^TSmallBlockPoolHeader;
// information for each small block size - 64/256 bytes long >= CPU cache line
TSmallBlockType = record
Locked: boolean;
AllowedGroupsForBlockPoolBitmap: byte;
BlockSize: Word;
MinimumBlockPoolSize: Word;
OptimalBlockPoolSize: Word;
NextPartiallyFreePool: PSmallBlockPoolHeader;
PreviousPartiallyFreePool: PSmallBlockPoolHeader;
NextSequentialFeedBlockAddress: pointer;
MaxSequentialFeedBlockAddress: pointer;
CurrentSequentialFeedPool: PSmallBlockPoolHeader;
GetmemCount: cardinal;
FreememCount: cardinal;
GetmemSleepCount: cardinal;
FreememSleepCount: cardinal;
{$ifdef FPCMM_LOCKLESSFREE} // 192 optional bytes for FreeMem Bin (= 13KB)
BinLocked: boolean; // dedicated lock for less contention
BinCount: byte;
BinSpinCount: cardinal;
BinInstance: array[0.. SmallBlockBinCount - 1] of pointer;
{$endif FPCMM_LOCKLESSFREE}
end;
PSmallBlockType = ^TSmallBlockType;
TSmallBlockTypes = array[0..NumSmallBlockTypes - 1] of TSmallBlockType;
TTinyBlockTypes = array[0..NumTinyBlockTypes - 1] of TSmallBlockType;
TSmallBlockInfo = record
Small: TSmallBlockTypes;
Tiny: array[0..NumTinyBlockArenas - 1] of TTinyBlockTypes;
GetmemLookup: array[0..
(MaximumSmallBlockSize - 1) div SmallBlockGranularity] of byte;
{$ifndef FPCMM_ASSUMEMULTITHREAD}
IsMultiThreadPtr: PBoolean; // safe access to IsMultiThread global variable
{$endif FPCMM_ASSUMEMULTITHREAD}
TinyCurrentArena: integer;
end;
TSmallBlockPoolHeader = record
BlockType: PSmallBlockType;
{$ifdef CPU32}
Padding32Bits: cardinal;
{$endif CPU32}
NextPartiallyFreePool: PSmallBlockPoolHeader;
PreviousPartiallyFreePool: PSmallBlockPoolHeader;
FirstFreeBlock: pointer;
BlocksInUse: cardinal;
SmallBlockPoolSignature: cardinal;
FirstBlockPoolPointerAndFlags: PtrUInt;
end;
PMediumBlockPoolHeader = ^TMediumBlockPoolHeader;
TMediumBlockPoolHeader = record
PreviousMediumBlockPoolHeader: PMediumBlockPoolHeader;
NextMediumBlockPoolHeader: PMediumBlockPoolHeader;
Reserved1: PtrUInt;
FirstMediumBlockSizeAndFlags: PtrUInt;
end;
PMediumFreeBlock = ^TMediumFreeBlock;
TMediumFreeBlock = record
PreviousFreeBlock: PMediumFreeBlock;
NextFreeBlock: PMediumFreeBlock;
end;
// medium locks occurs at getmem: freemem bin list got MaxCount=0 :(
{.$define FPCMM_LOCKLESSFREEMEDIUM}
{$ifdef FPCMM_LOCKLESSFREEMEDIUM}
const
MediumBlockLocklessBinCount = 255;
type
// used by TMediumBlockInfo to reduce thread pressure
TMediumLocklessBin = record
Locked: boolean; // dedicated lock for less contention
Count: byte;
MaxCount: byte;
Instance: array[0 .. MediumBlockLocklessBinCount - 1] of pointer;
end;
{$endif FPCMM_LOCKLESSFREEMEDIUM}
TMediumBlockInfo = record
Locked: boolean;
PoolsCircularList: TMediumBlockPoolHeader;
LastSequentiallyFed: pointer;
SequentialFeedBytesLeft: cardinal;
BinGroupBitmap: cardinal;
{$ifndef FPCMM_ASSUMEMULTITHREAD}
IsMultiThreadPtr: PBoolean; // safe access to IsMultiThread global variable
{$endif FPCMM_ASSUMEMULTITHREAD}
BinBitmaps: array[0..MediumBlockBinGroupCount - 1] of cardinal;
Bins: array[0..MediumBlockBinCount - 1] of TMediumFreeBlock;
{$ifdef FPCMM_LOCKLESSFREEMEDIUM}
LocklessBin: TMediumLocklessBin;
{$endif FPCMM_LOCKLESSFREEMEDIUM}
end;
PLargeBlockHeader = ^TLargeBlockHeader;
TLargeBlockHeader = record
PreviousLargeBlockHeader: PLargeBlockHeader;
NextLargeBlockHeader: PLargeBlockHeader;
Reserved: PtrUInt;
BlockSizeAndFlags: PtrUInt;
end;
const
BlockHeaderSize = SizeOf(pointer);
SmallBlockPoolHeaderSize = SizeOf(TSmallBlockPoolHeader);
MediumBlockPoolHeaderSize = SizeOf(TMediumBlockPoolHeader);
LargeBlockHeaderSize = SizeOf(TLargeBlockHeader);
LargeBlockGranularity = 65536;
var
SmallBlockInfo: TSmallBlockInfo;
MediumBlockInfo: TMediumBlockInfo;
SmallMediumBlockInfo: TMediumBlockInfo
{$ifdef FPCMM_SMALLNOTWITHMEDIUM} ;
{$else} absolute MediumBlockInfo;
{$endif FPCMM_SMALLNOTWITHMEDIUM}
LargeBlocksLocked: boolean;
LargeBlocksCircularList: TLargeBlockHeader;
{ ********* Shared Routines }
{$define FPCMM_CMPBEFORELOCK_SPIN}
// during spinning, there is clearly thread contention: in this case, plain
// "cmp" before "lock cmpxchg" is mandatory to leverage the CPU cores
procedure LockMediumBlocks;
{$ifdef NOSFRAME} nostackframe; {$endif} assembler;
// on input/output: r10=TMediumBlockInfo
asm
{$ifdef FPCMM_SLEEPTSC}
@s: rdtsc // tsc in edx:eax
shl rdx, 32
lea r9, [rax + rdx + SpinMediumLockTSC] // r9 = endtsc
@sp: pause
rdtsc
shl rdx, 32
or rax, rdx
cmp rax, r9
ja @rc // timeout
{$else}
@s: mov edx, SpinMediumLockCount
@sp: pause
dec edx
jz @rc //timeout
{$endif FPCMM_SLEEPTSC}
mov rcx, r10
mov eax, $100
{$ifdef FPCMM_CMPBEFORELOCK_SPIN}
cmp byte ptr [r10].TMediumBlockInfo.Locked, true
je @sp
{$endif FPCMM_CMPBEFORELOCK_SPIN}
lock cmpxchg byte ptr [rcx].TMediumBlockInfo.Locked, ah
je @ok
jmp @sp
@rc: push rsi // preserve POSIX and Win64 ABI registers
push rdi
push r10
push r11
call ReleaseCore
pop r11
pop r10
pop rdi
pop rsi
lea rax, [rip + HeapStatus]
{$ifdef FPCMM_DEBUG} lock {$endif}
inc qword ptr [rax].TMMStatus.Medium.SleepCount
jmp @s
@ok:
end;
procedure InsertMediumBlockIntoBin; nostackframe; assembler;