/* trinity/objects.c */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include "arch.h"
#include "child.h"
#include "debug.h"
#include "deferred-free.h"
#include "fd.h"
#include "list.h"
#include "locks.h"
#include "maps.h"
#include "objects.h"
#include "params.h"
#include "pc_format.h"
#include "pids.h"
#include "random.h"
#include "rnd.h"
#include "shm.h"
#include "stats_ring.h"
#include "trinity.h"
#include "utils.h"

/*
 * Registration list of provider init entries: appended to by
 * register_global_obj_init() and walked by init_global_objects().
 * The initializer points both link fields back at the head itself
 * (presumably the usual "empty circular list" shape from list.h).
 */
static struct list_head global_obj_list = { &global_obj_list, &global_obj_list };

/*
 * Per-type hard cap on parent_global_objects[].  High-volume providers
 * (sockets, bpf objs, ...) populated by REG_GLOBAL_OBJ init can balloon
 * an OBJ_GLOBAL pool to tens of thousands of entries pre-fork, which
 * (a) inflates every child's fork-time snapshot heap and (b) flattens
 * get_random_object()'s probability of revisiting any specific obj.
 * 4096 is comfortably above any pool we observe in steady state but low
 * enough to clamp pathological providers.
 */
#define OBJ_GLOBAL_MAX 4096

/*
 * Running count of OBJ_GLOBAL entries evicted by the hard-cap prune in
 * add_object().  Parent-private (the prune path runs only pre-fork, gated
 * by the mainpid guard above the OBJ_GLOBAL branch), so no atomic needed.
 * Surfaced under -v via the verbose output emitted on each prune event.
 */
static unsigned long obj_global_pruned;

/*
 * Parent-private OBJ_GLOBAL pool.  Populated pre-fork by every
 * REG_GLOBAL_OBJ provider via add_object(OBJ_GLOBAL); the per-child
 * snapshot in clone_global_objects_to_child() reads this array.
 * Lives in the parent's data segment, fork-COW'd into children whose
 * resolver (get_objhead) routes around it in favour of their own
 * private copy.
 */
static struct objhead parent_global_objects[MAX_OBJECT_TYPES];

/*
 * Parent-private fd->object hash and parallel compact live-fd list.
 * Same shape as the per-child snapshots; fd_hash_insert / fd_hash_remove
 * mutate these from the parent's pre-fork init and post-fork fd-event
 * drains.  Children read their own snapshots; the parent reads these
 * directly when servicing remove_object_by_fd() out of fd_event_drain().
 */
static struct fd_hash_entry parent_fd_hash[FD_HASH_SIZE];
static int parent_fd_live[FD_LIVE_MAX];
static unsigned int parent_fd_hash_count;
static unsigned int parent_fd_live_count;

/*
 * Queue a provider's init descriptor on global_obj_list (via
 * list_add_tail) so that init_global_objects() will invoke its init
 * hook when it walks the list.
 */
void register_global_obj_init(struct global_obj_entry *entry)
{
	struct list_head *node = (struct list_head *) &entry->list;

	list_add_tail(node, &global_obj_list);
}

/*
 * Walk global_obj_list and run each registered provider's init hook,
 * announcing the provider by name first.
 */
void init_global_objects(void)
{
	struct list_head *node;

	list_for_each(node, &global_obj_list) {
		struct global_obj_entry *provider;

		/*
		 * NOTE(review): the list node is cast straight to the
		 * entry, which assumes ->list sits at offset 0 of
		 * struct global_obj_entry -- confirm against objects.h.
		 */
		provider = (struct global_obj_entry *) node;

		output(1, "Initializing %s objects.\n", provider->name);
		provider->init();
	}
}

/*
 * Hash table mapping fd → (object, type) for O(1) lookup in the
 * parent's remove_object_by_fd().  Open-addressing with linear
 * probing.  The parent's view sits in parent_fd_hash[]; each child
 * holds an independent snapshot in child->fd_hash[] populated by
 * clone_global_objects_to_child().
 */

void fd_hash_init(void)
{
	unsigned int i;

	for (i = 0; i < FD_HASH_SIZE; i++) {
		parent_fd_hash[i].fd = -1;
		parent_fd_hash[i].gen = 0;
	}
	parent_fd_hash_count = 0;
	/*
	 * fd_live[] entries are gated by fd_live_count, so initialising
	 * just the count is sufficient; stale slot contents past the
	 * count are never read.
	 */
	parent_fd_live_count = 0;
}

/*
 * Push fd onto the end of the parent's compact live-fd list.
 * fd_hash_insert() calls this when a hash slot goes from empty to
 * occupied.  Only the parent writes this list, so no cross-process
 * coherence is needed.  When the list is already at FD_LIVE_MAX the fd
 * is dropped without error; the auditor reading the per-child snapshot
 * tolerates a missing fd.
 */
static void fd_live_append(int fd)
{
	if (parent_fd_live_count >= FD_LIVE_MAX)
		return;

	parent_fd_live[parent_fd_live_count] = fd;
	parent_fd_live_count++;
}

/*
 * Drop fd from the parent's live-fd list by overwriting its slot with
 * the last live entry and shrinking the count (order is not preserved).
 * The match is found by a linear scan of parent_fd_live[0..count).
 *
 * Instrumentation: every call bumps fd_live_remove_calls; a hit bumps
 * one log2 bucket of fd_live_remove_scan_histogram keyed by the index
 * the match was found at, and a miss bumps fd_live_remove_miss.  The
 * histogram exists so the "occupancy is only a few hundred entries, so
 * the scan is cheap" assumption can be checked from the periodic stats
 * dump, without a profile run, before an index is built for this list.
 * The parent is the only writer, so the RELAXED atomics just follow the
 * shm->stats convention and are not load-bearing for ordering.
 */
static void fd_live_remove(int fd)
{
	const unsigned int nr_buckets = (unsigned int)
		ARRAY_SIZE(shm->stats.fd_live_remove_scan_histogram);
	unsigned int live = parent_fd_live_count;
	unsigned int pos;
	unsigned int bucket = 0;

	__atomic_add_fetch(&shm->stats.fd_live_remove_calls, 1, __ATOMIC_RELAXED);

	for (pos = 0; pos < live; pos++) {
		if (parent_fd_live[pos] == fd)
			break;
	}

	if (pos == live) {
		/* Scanned the whole live range without a match. */
		__atomic_add_fetch(&shm->stats.fd_live_remove_miss, 1, __ATOMIC_RELAXED);
		return;
	}

	/* Swap-remove: move the tail entry into the hole, then shrink. */
	if (pos != live - 1)
		parent_fd_live[pos] = parent_fd_live[live - 1];
	parent_fd_live_count = live - 1;

	/*
	 * bucket = floor(log2(pos)) + 1, with pos == 0 (match on the first
	 * slot) kept in bucket 0.  For a 32-bit unsigned that is
	 * 32 - clz(pos).  Anything past the last bucket is folded into it.
	 */
	if (pos != 0) {
		bucket = 32u - (unsigned int)__builtin_clz(pos);
		if (bucket >= nr_buckets)
			bucket = nr_buckets - 1u;
	}
	__atomic_add_fetch(&shm->stats.fd_live_remove_scan_histogram[bucket],
			   1, __ATOMIC_RELAXED);
}

/*
 * Home slot for fd in an FD_HASH_SIZE-entry table: the low bits of the
 * fd selected by (FD_HASH_SIZE - 1).  Masking only works as a modulo if
 * FD_HASH_SIZE is a power of two -- presumably guaranteed where it is
 * defined.
 */
static unsigned int fd_hash_slot(int fd)
{
	const unsigned int mask = FD_HASH_SIZE - 1;

	return (unsigned int) fd & mask;
}

/*
 * Internal insert that preserves the entry's existing generation and
 * doesn't update fd_hash_count.  Used by fd_hash_remove to re-hash
 * displaced entries: the entry's identity is unchanged, only its slot.
 */
static void fd_hash_reinsert(int fd, struct object *obj, enum objecttype type,
			     uint32_t gen)
{
	unsigned int slot;
	unsigned int probe;

	slot = fd_hash_slot(fd);
	for (probe = 0; probe < FD_HASH_SIZE; probe++) {
		if (parent_fd_hash[slot].fd == -1)
			break;
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
	if (probe == FD_HASH_SIZE) {
		__atomic_add_fetch(&shm->stats.fd_hash_reinsert_dropped, 1,
				   __ATOMIC_RELAXED);
		outputerr("fd_hash_reinsert: table full, dropping fd %d\n", fd);
		return;
	}

	parent_fd_hash[slot].obj = obj;
	parent_fd_hash[slot].type = type;
	parent_fd_hash[slot].gen = gen;
	parent_fd_hash[slot].fd = fd;
}

bool fd_hash_insert(int fd, struct object *obj, enum objecttype type)
{
	unsigned int slot;

	if (fd < 0)
		return true;

	if (parent_fd_hash_count >= FD_HASH_SIZE)
		return false;

	slot = fd_hash_slot(fd);
	while (parent_fd_hash[slot].fd != -1 && parent_fd_hash[slot].fd != fd)
		slot = (slot + 1) & (FD_HASH_SIZE - 1);

	if (parent_fd_hash[slot].fd == -1) {
		parent_fd_hash_count++;
		fd_live_append(fd);
	}

	parent_fd_hash[slot].obj = obj;
	parent_fd_hash[slot].type = type;
	parent_fd_hash[slot].gen++;
	parent_fd_hash[slot].fd = fd;
	return true;
}

void fd_hash_remove(int fd)
{
	unsigned int slot, next, i;

	if (fd < 0)
		return;

	slot = fd_hash_slot(fd);
	for (i = 0; i < FD_HASH_SIZE; i++) {
		if (parent_fd_hash[slot].fd == -1)
			return;
		if (parent_fd_hash[slot].fd == fd) {
			parent_fd_hash[slot].gen++;
			parent_fd_hash[slot].fd = -1;
			fd_live_remove(fd);
			next = (slot + 1) & (FD_HASH_SIZE - 1);
			while (parent_fd_hash[next].fd != -1) {
				struct fd_hash_entry displaced = parent_fd_hash[next];
				parent_fd_hash[next].fd = -1;
				fd_hash_reinsert(displaced.fd, displaced.obj,
						 displaced.type, displaced.gen);
				next = (next + 1) & (FD_HASH_SIZE - 1);
			}
			parent_fd_hash_count--;
			return;
		}
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
}

void fd_hash_remove_local(int fd)
{
	struct childdata *child;
	struct fd_hash_entry *table;
	unsigned int slot, next, i;

	if (fd < 0)
		return;

	if (mypid() == mainpid)
		return;

	child = this_child();
	if (child == NULL || child->fd_hash == NULL)
		return;

	table = child->fd_hash;
	slot = fd_hash_slot(fd);
	for (i = 0; i < FD_HASH_SIZE; i++) {
		if (table[slot].fd == -1)
			return;
		if (table[slot].fd == fd) {
			table[slot].gen++;
			table[slot].fd = -1;
			next = (slot + 1) & (FD_HASH_SIZE - 1);
			while (table[next].fd != -1) {
				struct fd_hash_entry displaced = table[next];
				unsigned int rs;

				table[next].fd = -1;
				rs = fd_hash_slot(displaced.fd);
				while (table[rs].fd != -1 &&
				       table[rs].fd != displaced.fd)
					rs = (rs + 1) & (FD_HASH_SIZE - 1);
				table[rs] = displaced;
				next = (next + 1) & (FD_HASH_SIZE - 1);
			}
			return;
		}
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
}

/*
 * Evict every entry whose fd lies in [lo, hi] from the calling child's
 * private fd hash.  No-op when the range is empty (lo > hi), when it
 * contains no valid fd (hi < 0), or when the caller has no table
 * (this_child() returned NULL or child->fd_hash is NULL).
 */
void fd_hash_remove_local_range(int lo, int hi)
{
	struct childdata *child;
	struct fd_hash_entry *table;
	unsigned int i;

	if (lo > hi || hi < 0)
		return;

	child = this_child();
	if (child == NULL || child->fd_hash == NULL)
		return;
	table = child->fd_hash;

	/*
	 * One walk over the local hash table, evicting every slot whose
	 * fd is in [lo, hi], instead of paying an FD_HASH_SIZE-bounded
	 * probe per fd in the range.
	 *
	 * fd_hash_remove_local() re-hashes the occupied run that follows
	 * the evicted slot.  A re-hashed entry can land back in the slot
	 * just cleared, but never in a slot before it: probing from the
	 * entry's home slot reaches the just-emptied slot before any
	 * wrap-around landing site.  So after a successful eviction the
	 * same index is examined again, because it may now hold a
	 * different fd that is itself in range.
	 *
	 * Two guards keep that re-examination from looping forever:
	 *
	 *  - Negative fds are skipped outright.  -1 is the "empty slot"
	 *    sentinel and fd_hash_remove_local() ignores negative fds, so
	 *    with lo <= -1 an empty slot would otherwise match the range
	 *    test on every pass and never change.
	 *
	 *  - The index is only revisited when the slot's contents really
	 *    changed.  If the removal made no progress (for example it
	 *    bailed out because we are the main process) we move on.
	 */
	for (i = 0; i < FD_HASH_SIZE; i++) {
		int fd = table[i].fd;

		if (fd < 0 || fd < lo || fd > hi)
			continue;

		fd_hash_remove_local(fd);

		/*
		 * i is unsigned: at i == 0 the decrement wraps to UINT_MAX
		 * and the loop's i++ brings it back to 0, which is
		 * well-defined and exactly the "retry this slot" we want.
		 */
		if (table[i].fd != fd)
			i--;
	}
}

/*
 * Look up fd in the fd hash visible to the calling process.
 *
 * The main process searches parent_fd_hash.  A child searches its own
 * fork-time snapshot (child->fd_hash) and falls back to parent_fd_hash
 * only in the early init_child window before that snapshot has been
 * allocated.
 *
 * Returns a pointer to the matching table slot, or NULL if fd is
 * negative or not present.
 */
struct fd_hash_entry *fd_hash_lookup(int fd)
{
	struct fd_hash_entry *table = parent_fd_hash;
	unsigned int idx, tries;

	if (fd < 0)
		return NULL;

	if (mypid() != mainpid) {
		struct childdata *self = this_child();

		if (self != NULL && self->fd_hash != NULL)
			table = self->fd_hash;
	}

	idx = fd_hash_slot(fd);
	for (tries = 0; tries < FD_HASH_SIZE; tries++) {
		struct fd_hash_entry *entry = &table[idx];
		int cur = entry->fd;	/* read the slot's fd exactly once */

		if (cur == fd)
			return entry;
		if (cur == -1)
			return NULL;	/* probe chain ends here */
		idx = (idx + 1) & (FD_HASH_SIZE - 1);
	}
	return NULL;
}

/*
 * True when type falls inside the contiguous enum range
 * [OBJ_FD_PIPE, OBJ_FD_SCRATCH_BLOCK], which this file treats as the
 * fd-backed object types.
 */
static bool is_fd_type(enum objecttype type)
{
	if (type < OBJ_FD_PIPE)
		return false;

	return type <= OBJ_FD_SCRATCH_BLOCK;
}

/*
 * Per-objhead fd→object hash for OBJ_LOCAL fd-typed pools.
 *
 * Open-addressing with linear probing into a fixed power-of-two slot array
 * (LOCAL_FD_HASH_SIZE).  fd == -1 marks empty.  The table lives in the
 * owning child's private heap — head->fd_hash itself sits in shm alongside
 * the rest of the objhead, but the buffer it points at is per-process and
 * unreachable from any other address space, the same shape head->array
 * uses for OBJ_LOCAL pools allocated via get_objhead(OBJ_LOCAL).
 *
 * Replaces the O(n) linear walk over head->array in
 * find_local_object_by_fd() with a single hash probe.  That function is
 * called from register_returned_fd() on every successful RET_FD syscall
 * whose entry->ret_objtype is not OBJ_NONE (open, openat, socket, accept,
 * eventfd, timerfd, perf_event_open, io_uring_setup, memfd_create,
 * pidfd, fanotify_init, etc.), so the saving applies on the syscall hot
 * path with head->num_entries typically in the tens-to-low-hundreds.
 */
/* Home slot for fd in a per-objhead table of LOCAL_FD_HASH_SIZE slots. */
static unsigned int local_fd_hash_slot_idx(int fd)
{
	const unsigned int mask = LOCAL_FD_HASH_SIZE - 1;

	return (unsigned int)fd & mask;
}

static void local_fd_hash_alloc(struct objhead *head)
{
	unsigned int i;

	head->fd_hash = malloc(LOCAL_FD_HASH_SIZE *
			       sizeof(struct local_fd_hash_slot));
	if (head->fd_hash == NULL)
		return;
	for (i = 0; i < LOCAL_FD_HASH_SIZE; i++) {
		head->fd_hash[i].fd = -1;
		head->fd_hash[i].obj = NULL;
	}
}

/*
 * Internal insert that does not check for an existing entry — used by
 * local_fd_hash_remove() to re-seat displaced entries after a removal.
 * The displaced entry's identity is unchanged, so the original (fd, obj)
 * pair is reinserted unconditionally into the first empty slot.
 */
static void local_fd_hash_reinsert(struct objhead *head, int fd,
				   struct object *obj)
{
	unsigned int slot, probe;

	slot = local_fd_hash_slot_idx(fd);
	for (probe = 0; probe < LOCAL_FD_HASH_SIZE; probe++) {
		if (head->fd_hash[slot].fd == -1) {
			head->fd_hash[slot].fd = fd;
			head->fd_hash[slot].obj = obj;
			return;
		}
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
}

static void local_fd_hash_insert(struct objhead *head, int fd,
				 struct object *obj)
{
	unsigned int slot, probe;

	if (fd < 0)
		return;
	if (head->fd_hash == NULL) {
		local_fd_hash_alloc(head);
		if (head->fd_hash == NULL)
			return;
	}

	slot = local_fd_hash_slot_idx(fd);
	for (probe = 0; probe < LOCAL_FD_HASH_SIZE; probe++) {
		if (head->fd_hash[slot].fd == -1 ||
		    head->fd_hash[slot].fd == fd) {
			head->fd_hash[slot].fd = fd;
			head->fd_hash[slot].obj = obj;
			return;
		}
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
	/*
	 * Table saturated.  Realistically unreachable — LOCAL_FD_HASH_SIZE
	 * sits well above any per-(child, type) pool we have observed —
	 * but if it ever happens the caller gracefully falls back to the
	 * uninserted state: find_local_object_by_fd() returns NULL and
	 * register_returned_fd() simply re-adds, which is the same outcome
	 * as the pre-hash linear walk missing the entry.  Bump a stat so
	 * the silent drop is observable in the end-of-run summary.
	 */
	__atomic_add_fetch(&shm->stats.local_fd_hash_insert_dropped, 1,
			   __ATOMIC_RELAXED);
}

static void local_fd_hash_remove(struct objhead *head, int fd)
{
	unsigned int slot, next, i;

	if (fd < 0 || head->fd_hash == NULL)
		return;

	slot = local_fd_hash_slot_idx(fd);
	for (i = 0; i < LOCAL_FD_HASH_SIZE; i++) {
		if (head->fd_hash[slot].fd == -1)
			return;
		if (head->fd_hash[slot].fd == fd) {
			head->fd_hash[slot].fd = -1;
			head->fd_hash[slot].obj = NULL;
			/*
			 * Linear-probing removal: re-seat any entries in the
			 * chain following us so a later lookup that hashes
			 * past this newly-empty slot still finds them.
			 */
			next = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
			while (head->fd_hash[next].fd != -1) {
				struct local_fd_hash_slot displaced =
					head->fd_hash[next];
				head->fd_hash[next].fd = -1;
				head->fd_hash[next].obj = NULL;
				local_fd_hash_reinsert(head, displaced.fd,
						       displaced.obj);
				next = (next + 1) & (LOCAL_FD_HASH_SIZE - 1);
			}
			return;
		}
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
}

static struct object *local_fd_hash_lookup(struct objhead *head, int fd)
{
	unsigned int slot, i;

	if (fd < 0 || head->fd_hash == NULL)
		return NULL;

	slot = local_fd_hash_slot_idx(fd);
	for (i = 0; i < LOCAL_FD_HASH_SIZE; i++) {
		if (head->fd_hash[slot].fd == -1)
			return NULL;
		if (head->fd_hash[slot].fd == fd)
			return head->fd_hash[slot].obj;
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
	return NULL;
}

/*
 * Every obj struct comes from alloc_object() (zmalloc) and lives in
 * the allocating process's private heap.  OBJ_GLOBAL pools are
 * populated pre-fork in the parent, then fork-COW'd into children's
 * snapshots; OBJ_LOCAL pools are wholly per-child.  No path crosses
 * the shared mapping for obj storage.
 */
struct object * alloc_object(void)
{
	struct object *obj;

	/* Runs before every allocation; see heap_brk_maybe_refresh(). */
	heap_brk_maybe_refresh();

	obj = zmalloc_tracked(sizeof(*obj));
	return obj;
}

/*
 * Release an obj struct.  Routed through deferred_free_enqueue()
 * rather than free()'d immediately so a stale slot pointer that
 * survived past __destroy_object() lands on a chunk with a 5-50
 * syscall TTL (effective 80-800 with DEFERRED_TICK_BATCH) instead
 * of glibc-reclaimed memory: get_map() and friends read &obj->map
 * after taking the slot pointer out of head->array, and the arg-gen
 * path that invoked get_map() can hold the pointer across the
 * window in which the slot's owner destroys the obj.
 *
 * Zero the chunk before handing it to the deferred-free ring so a
 * post-destroy read (via a stale slot pointer) trips the size==0
 * band of consumer sanity checks instead of dereferencing an obj
 * whose name string or mmap pointer was already torn down by the
 * destructor.
 */
static void release_obj(struct object *obj,
			enum obj_scope scope __attribute__((unused)),
			enum objecttype type __attribute__((unused)))
{
	/*
	 * Scrub the whole struct before queueing it so a reader holding
	 * a stale pointer sees zeroed fields, not torn-down state.
	 */
	memset(obj, 0, sizeof(struct object));

	/* Queued for deferred release; never free()'d directly here. */
	deferred_free_enqueue(obj);
}

/*
 * Resolve the objhead for (scope, type) as seen by the calling process.
 *
 *   OBJ_LOCAL (any non-GLOBAL scope): the caller's own
 *       child->objects[type], or NULL if this_child() returns NULL.
 *   OBJ_GLOBAL in the main process: parent_global_objects[type].
 *   OBJ_GLOBAL in a child: child->global_objects[type], or NULL when
 *       the child has no snapshot.
 *
 * type is used as an array index without a range check.
 */
struct objhead * get_objhead(enum obj_scope scope, enum objecttype type)
{
	struct childdata *child;

	if (scope != OBJ_GLOBAL) {
		child = this_child();
		if (child == NULL)
			return NULL;
		return &child->objects[type];
	}

	/* The parent's writer view lives in this file. */
	if (mypid() == mainpid)
		return &parent_global_objects[type];

	/*
	 * A child only ever resolves against its fork-time snapshot
	 * (allocated by clone_global_objects_to_child) and NEVER falls
	 * back to the parent's view.  Two reasons:
	 *
	 *  - The OBJ_GLOBAL contract pins children to the snapshot;
	 *    post-fork growth in the parent is meant to be invisible.
	 *
	 *  - The child's COW copy of the parent's head->array pointer
	 *    may name a heap chunk the parent has since replaced via the
	 *    deferred-free hand-off in add_object_grow_capacity(), so an
	 *    indexed read through it would land in recycled memory.
	 *
	 * Returning NULL lets a child whose snapshot is missing (early
	 * init, snapshot alloc failure) take the "empty pool" branch
	 * rather than read another address space's bookkeeping.
	 */
	child = this_child();
	if (child == NULL || child->global_objects == NULL)
		return NULL;

	return &child->global_objects[type];
}


/*
 * Snapshot helper for the for_each_obj iterator macro.  Captures
 * num_entries and array into the caller's state struct so the loop
 * body operates on a per-invocation hoist rather than re-loading
 * head fields on every iteration.  No cross-process coherence is
 * required post-Stage-5 — every pool lives in the iterating
 * process's private heap.
 */
void __for_each_obj_init(struct objhead *head,
			 struct __for_each_obj_state *s)
{
	/* Hoist the array pointer once for the whole iteration. */
	s->array_snap = head->array;

	/* A pool with no backing array is iterated as empty. */
	s->n_snap = (s->array_snap != NULL) ? head->num_entries : 0;
}

/*
 * Global object array backing storage.  Allocated via __zmalloc (plain
 * malloc), so the buffer lives in the parent's PRIVATE heap and is
 * fork-COW'd into every child rather than shared MAP_SHARED.  Children
 * do not read the parent's view directly post-fork: get_objhead()
 * routes them to their own snapshot (clone_global_objects_to_child())
 * and returns NULL when the snapshot is missing, so the COW divergence
 * between the parent's live head->array and the child's frozen view
 * never reaches an indexed read.
 */
/*
 * Up-front input validation for add_object().  Three rejections,
 * all cheaper than the slot-resolution / grow / publish work that
 * follows -- if any of them fires we release the obj back to the
 * deferred-free ring and tell the caller to bail without ever
 * touching the per-type pool:
 *
 *   - the verbose-mode caller trace (gated on -vv, used when
 *     attributing churn back to a specific .post handler),
 *   - the fd-bound rejection check for fd-typed objects (any
 *     value past NR_OPEN is upper-bit corruption that the loose
 *     "(long)retval >= 0" gate in register_returned_fd / the
 *     per-syscall .post handlers let through),
 *   - the OBJ_GLOBAL post-fork guard (OBJ_GLOBAL is pre-fork-only
 *     by construction; a child that reached add_object(OBJ_GLOBAL)
 *     would mutate only its private copy with no benefit).
 *
 * obj->obj_type is stamped between the fd-bound gate and the
 * post-fork guard so the tag is set exactly once on the success
 * path; release_obj()'s memset zeroes it back to OBJ_NONE on the
 * failure paths.
 *
 * The caller_pc parameter is the captured __builtin_return_address(0)
 * from add_object()'s entry, threaded in so the verbose trace and
 * the bad-fd outputerr / post_handler_corrupt_ptr_bump_site PC
 * captures still name the real caller of add_object() rather than
 * this helper's frame.
 *
 * is_fd / fd are hoisted by add_object() from a single is_fd_type()
 * + fd_from_object() pair at function entry and threaded through to
 * here (and onward into the grow / publish helpers) so the same
 * inputs aren't re-resolved 3-4x per fd-returning syscall.  Pure
 * CSE -- the obj's fd union member is not written by any add_object
 * path, so any later re-read would return identical bytes.
 *
 * Returns true if the obj was rejected (release_obj already
 * called -- add_object() must return immediately); false if
 * validation passed and the slot-resolution / grow / publish
 * phases should run.
 */
static bool add_object_validate(struct object *obj, enum obj_scope scope,
				enum objecttype type, void *caller_pc,
				bool is_fd, int fd)
{
	/*
	 * 1<<20 = 1048576 matches the kernel's NR_OPEN ceiling
	 * (include/uapi/linux/fs.h), the absolute upper bound
	 * RLIMIT_NOFILE may be raised to on every distro we exercise.
	 */
	enum { ADD_OBJECT_FD_CEILING = 1 << 20 };
	char pcbuf[128];

	/*
	 * Variadic arguments get no prototype-driven conversion, so each
	 * one is cast to the exact type its conversion specifier expects:
	 * %p takes void *, %d takes int, %u takes unsigned int.
	 */
	if (unlikely(verbosity > 1)) {
		output(2, "ADD-OBJ slot=%p type=%d caller=%s\n", (void *) obj,
			(int) type,
			pc_to_string(caller_pc, pcbuf, sizeof(pcbuf)));
	}

	/*
	 * Reject obviously-corrupted fd values before they enter any pool.
	 * Any retval decoding to a value at or past the ceiling is a
	 * smoking-gun upper-bit corruption (sign-extended or
	 * wholesale-stomped rec->retval) that the existing
	 * "(long)retval >= 0" gate in register_returned_fd / the
	 * per-syscall .post handlers let through because the lower bits
	 * happened to be positive.
	 */
	if (is_fd && (fd < 0 || fd >= ADD_OBJECT_FD_CEILING)) {
		outputerr("add_object: rejecting out-of-bound fd=%d "
			  "type=%u caller=%s\n", fd, (unsigned int) type,
			  pc_to_string(caller_pc,
				       pcbuf, sizeof(pcbuf)));
		post_handler_corrupt_ptr_bump_site(NULL,
						   caller_pc,
						   "add_object:fd");
		release_obj(obj, scope, type);
		return true;
	}

	/*
	 * Stamp the pool tag now that the obj has passed the fd-bound
	 * gate and is about to enter a pool.  Read back by
	 * objpool_check() in consumers to catch wild-obj-pointer derefs
	 * that a loose address-range shape check lets through.
	 * release_obj()'s memset zeroes the chunk on the way back to the
	 * deferred-free ring, which resets the tag for any future
	 * stale-pointer reader.
	 */
	obj->obj_type = type;

	/*
	 * OBJ_GLOBAL is pre-fork-only by construction: every provider
	 * REG_GLOBAL_OBJ init runs in the parent before fork_children(),
	 * and the per-child snapshot is taken at fork time.  A post-fork
	 * child that reached add_object(OBJ_GLOBAL) would mutate only its
	 * private copy with no benefit, so route the call to nowhere.
	 */
	if (scope == OBJ_GLOBAL && mypid() != mainpid) {
		release_obj(obj, scope, type);
		return true;
	}

	return false;
}

/*
 * Grow head->array if the next slot is past current capacity.  head
 * is resolved once in add_object() and threaded through; same for
 * the hoisted is_fd / fd pair used by the leak-close error paths.
 *
 * The alloc-track LRU slot for the live head->array container is
 * refreshed before the grow check so the upcoming
 * deferred_free_enqueue(oldarray) doesn't reject on an alloc_track
 * miss after thousands of intervening zmalloc_tracked calls in
 * cap>=1024 pools.  An alloc_track miss would leak the old chunk
 * rather than UAF it, but still silently bypasses the deferred-free
 * path the indexed-read correctness model relies on.
 *
 * Both scopes use the same allocate-copy-defer-free shape: a fresh
 * zmalloc_tracked container, memcpy the live slots over, publish
 * head->array + array_capacity, bump array_generation, then
 * deferred_free_enqueue(oldarray).  The deferred-free TTL (5-50
 * syscalls, effective 80-800 with DEFERRED_TICK_BATCH) keeps the
 * old chunk readable across any in-flight reader's snapshot
 * through objhead_indexed_read() -- without it, the same process
 * can re-enter the picker during arg-gen, hold a cached
 * head->array snapshot across the grow, and UAF the freed
 * container.  Same hazard shape as the obj-struct deferred-free
 * path: a live container freed underneath a cached reader is a
 * use-after-free.
 *
 * OBJ_GLOBAL needs the same deferral as OBJ_LOCAL even though the
 * writer is single (parent pre-fork only): the parent itself reads
 * its own pre-fork OBJ_GLOBAL pool during arg-gen, so single-writer
 * does not imply single-reader.  This is single-process re-entrancy,
 * not cross-thread.
 *
 * Both branches cap-overflow-guard at UINT_MAX / 2.  On either the
 * overflow or the malloc-failure path: close any leaked fd,
 * release_obj() the inbound obj, and tell the caller to bail.
 *
 * Returns true if the grow failed (release_obj already called --
 * add_object() must return immediately); false if either no grow
 * was needed or the grow succeeded and the publish phase should run.
 */
static bool add_object_grow_capacity(struct object *obj, enum obj_scope scope,
				     enum objecttype type, struct objhead *head,
				     bool is_fd, int fd)
{
	struct object **newarray;
	struct object **oldarray;
	unsigned int n, cap, newcap;
	size_t newbytes;

	n = head->num_entries;
	cap = head->array_capacity;

	/*
	 * Refresh head->array's alloc_track LRU slot before the grow
	 * check below.  Inter-grow windows on cap>=1024 pools span
	 * thousands of intervening zmalloc_tracked calls -- without this
	 * refresh the live container ages out of the 4096-slot ring and
	 * the next grow's deferred_free_enqueue(oldarray) rejects on
	 * alloc_track miss (leak, not UAF, but still silently bypasses
	 * the deferred-free path the indexed-read correctness model
	 * relies on).  Both scopes alloc via zmalloc_tracked so the
	 * refresh applies uniformly; the NULL guard skips the first
	 * grow (empty pool).
	 */
	if (head->array != NULL)
		alloc_track_refresh(head->array);

	/* The next slot still fits in the current container: no grow. */
	if (n < cap)
		return false;

	/*
	 * Single grow path shared by OBJ_GLOBAL and OBJ_LOCAL.  Both
	 * scopes need the same allocate-copy-defer-free shape: the
	 * writer is single in either scope (parent pre-fork on
	 * OBJ_GLOBAL, owning child on OBJ_LOCAL), but that same process
	 * can re-enter the picker during arg-gen and hold a cached
	 * head->array snapshot across this grow.  Freeing the old
	 * container immediately would UAF that in-flight indexed read,
	 * so the old chunk is handed to the deferred-free ring instead
	 * (5-50 syscall TTL, effective 80-800 with DEFERRED_TICK_BATCH).
	 *
	 * Two overflow guards, both reported as "cap overflow":
	 *  - the doubling itself must not wrap unsigned int;
	 *  - the byte size must not wrap size_t.  Where size_t is no
	 *    wider than unsigned int (ILP32), newcap * sizeof(ptr) can
	 *    wrap even though newcap did not, which would under-allocate
	 *    the container and let the memcpy below run off its end.
	 */
	if (cap > UINT_MAX / 2)
		goto cap_overflow;
	newcap = cap ? cap * 2 : 16;
	newbytes = (size_t)newcap * sizeof(struct object *);
	if (newbytes / sizeof(struct object *) != newcap)
		goto cap_overflow;

	newarray = zmalloc_tracked(newbytes);
	if (newarray == NULL) {
		outputerr("add_object: malloc failed for type %u (cap %u)\n",
			  type, newcap);
		goto fail;
	}

	/*
	 * Copy the old slots over.  cap < newcap and newbytes did not
	 * wrap, so the copy length cannot wrap either.
	 */
	oldarray = head->array;
	if (oldarray != NULL && cap > 0)
		memcpy(newarray, oldarray,
		       (size_t)cap * sizeof(struct object *));
	head->array = newarray;
	head->array_capacity = newcap;
	/*
	 * Bump before the deferred-free hand-off so any reader whose
	 * snapshot raced this grow re-reads the new generation and
	 * drops the pick rather than indexing the (now-ttl'd) old
	 * container.  Pool-private single-writer, so an unlocked bump
	 * is sufficient.  See objhead_indexed_read().
	 */
	head->array_generation++;
	if (oldarray != NULL)
		deferred_free_enqueue(oldarray);

	return false;

cap_overflow:
	outputerr("add_object: cap overflow type=%u num_entries=%u capacity=%u\n",
		  type, n, cap);
	/* fall through to the shared failure cleanup */
fail:
	/*
	 * Shared failure tail: close the fd that would otherwise leak,
	 * release the inbound obj, and tell add_object() to bail.
	 */
	if (is_fd && fd >= 0)
		close(fd);
	release_obj(obj, scope, type);
	return true;
}

/*
 * Publish the inbound obj into its resolved slot and run the
 * post-publish bookkeeping: scope-conditional fd-hash registration
 * (with rollback on OBJ_GLOBAL hash-full), the verbose-mode
 * per-object dump, and the LOCAL / GLOBAL prune calls that keep
 * the pool within its steady-state ceiling.
 *
 * Stamp ordering inside the publish block is slot-array first,
 * then array_idx, then the monotonic slot_version tag, then the
 * publish-time fleet op tick, then the head->num_entries bump
 * last -- any consumer that re-reads obj fields off head->array
 * sees a fully-populated obj as soon as num_entries admits it.
 *
 * head / is_fd / fd are resolved once in add_object() and threaded
 * through, so this function does not re-enter get_objhead(),
 * is_fd_type() or fd_from_object() -- behavior-preserving CSE on a
 * hot path.
 *
 * OBJ_GLOBAL fd_hash registration is the only failure path: a
 * fd_hash_insert() reject means the parent's global fd_hash is
 * full -- we roll back the just-published slot (drop num_entries
 * back, NULL the array slot), close the fd that would otherwise
 * leak, release_obj() the inbound obj, and return internally.  No
 * further work follows the publish in the caller.
 */
static void add_object_publish(struct object *obj, enum obj_scope scope,
			       enum objecttype type, struct objhead *head,
			       bool is_fd, int fd)
{
	unsigned int n;

	n = head->num_entries;

	head->array[n] = obj;
	obj->array_idx = n;
	/*
	 * Stamp the per-pool monotonic identity tag.  Pre-increment so
	 * the first issued value is 1; the zero left by release_obj()'s
	 * memset on a freed obj is reserved as a never-issued sentinel.
	 * Stamped after the slot-array insert and the array_idx assign
	 * so any consumer that re-reads obj fields off head->array sees
	 * a fully populated obj as soon as num_entries below admits it.
	 */
	obj->slot_version = ++head->next_slot_version;
	/*
	 * Stamp the publish-time fleet op tick from the child-readable
	 * mirror page.  parent_stats.op_count is MAP_PRIVATE heap so
	 * a child COW-copy goes stale immediately after fork; the
	 * shm_published mirror is the republished, child-visible copy
	 * of the same counter.  No current reader -- pre-stage field
	 * for the upcoming diag-drain consumer.  RELAXED matches the
	 * parent's __atomic_store_n in stats_publish_locked(); a plain
	 * child read racing the parent's atomic write of the same shm
	 * word is a C11 data race.
	 */
	obj->publish_call_nr = shm_published
	      ? __atomic_load_n(&shm_published->fleet_op_count, __ATOMIC_RELAXED)
	      : 0;
	head->num_entries = n + 1;

	/*
	 * Maintain the per-child OBJ_LOCAL OBJ_MMAP_* nonempty-pool mask
	 * that get_map_handle() uses to skip guaranteed-empty pools.  This
	 * publish is the 0->1 transition iff the pre-publish n was zero --
	 * any larger n means the bit is already set.  Only the three mmap
	 * pool types participate; mmap_pool_bit_for_type() returns -1 for
	 * everything else and the branch is skipped.  OBJ_GLOBAL is
	 * parent-only by construction (see add_object_validate's post-fork
	 * guard) and the mask lives in childdata, so the maintenance is
	 * gated on scope == OBJ_LOCAL.
	 */
	if (scope == OBJ_LOCAL && n == 0) {
		int bit = mmap_pool_bit_for_type(type);