// 0. Memory barriers in practice for software engineers
//
// Taylor R Campbell, riastradh@NetBSD.org
// 2022-09-17
// EuroBSDcon 2022, Vienna, Austria
//
// TL;DR: Memory barriers come in pairs to order FOUR memory operations:

    /* cpu0 */                          /* cpu1 */
    A();
    membar_release();
    *p = ...;                           x = *p;
                                        membar_acquire();
                                        B(x);

    A();
    atomic_store_release(p, ...);       x = atomic_load_acquire(p);
                                        B(x);

// GUARANTEE: if cpu0 store at p is witnessed by cpu1 load at p,
// then A happens before B
//
// optimization: membar_producer and membar_consumer if cpu1 is read-only
//
// RED FLAG: store-before-load (exotic protocols like Dekker's algorithm)

// https://NetBSD.org/gallery/presentations/riastradh/eurobsdcon2022/membars.txt

// 1. Simple counter

unsigned long long C;

void count(void)
{
    C += 1;
}

// 2. Counter with explicit load and store

unsigned long long C;

void count(void)
{
    unsigned long long t;

    t = C;
    t += 1;
    C = t;
}

// 3. Counter with lock using atomic test-and-set

unsigned long long C;
int L;                  /* 0 when free, 1 when locked */

static int
test_and_set(int *p)
{
    int x = 1;
    asm("xchgl %0, %1" : "+r"(x), "+m"(*p));    /* XCHG is atomic */
    return x;
}

void count(void)
{
    unsigned long long t;

    while (test_and_set(&L))    /* Wait until unlocked, and lock it. */
        continue;
    t = C;                      /* Critical section. */
    t += 1;
    C = t;
    L = 0;                      /* Unlock it. */
}

// 4. Problem: Compiler can reorder without changing sequential semantics

unsigned long long C;
int L;                  /* 0 when free, 1 when locked */

static int
test_and_set(int *p)
{
    int x = 1;
    asm("xchgl %0, %1" : "+r"(x), "+m"(*p));    /* XCHG is atomic */
    return x;
}

void count(void)
{
    unsigned long long t;

    while (test_and_set(&L))    /* Wait until unlocked, and lock it. */
        continue;
    L = 0;                      /* Unlock it. */
    t = C;                      /* Critical section ??? */
    t += 1;
    C = t;
}

// 5.
Counter with lock using atomic test-and-set

unsigned long long C;
int L;                  /* 0 when free, 1 when locked */

static int
test_and_set(int *p)
{
    int x = 1;
    asm("xchgl %0, %1" : "+r"(x), "+m"(*p));    /* XCHG is atomic */
    return x;
}

void count(void)
{
    unsigned long long t;

    while (test_and_set(&L))    /* Wait until unlocked, and lock it. */
        continue;
    t = C;                      /* Critical section. */
    t += 1;
    C = t;
    L = 0;                      /* Unlock it. */
}

// 6. Problem: Compiler can reorder without changing sequential semantics

0000000000000000 <count>:
   0:   90000000    adrp    x0, 0
            0: R_AARCH64_ADR_PREL_PG_HI21   .bss
   4:   91000000    add     x0, x0, #0x0
            4: R_AARCH64_ADD_ABS_LO12_NC    .bss
   8:   52800021    mov     w1, #0x1
                        // while (test_and_set(&L)) ...
   c:   885f7c02    ldxr    w2, [x0]
  10:   88037c01    stxr    w3, w1, [x0]
  14:   34ffffc3    cbz     w3, c
  18:   35ffffa2    cbnz    w2, c
  1c:   f9400401    ldr     x1, [x0, #8]    // t = C
  20:   b900001f    str     wzr, [x0]       // L = 0
  24:   91000421    add     x1, x1, #0x1    // t += 1
  28:   f9000401    str     x1, [x0, #8]    // C = t
  2c:   d65f03c0    ret

// 7. Instruction barrier -- C code

void count(void)
{
    unsigned long long t;

    while (test_and_set(&L))    /* Wait until unlocked, and lock it. */
        continue;
    asm volatile("" ::: "memory");
    t = C;
    t += 1;
    C = t;
    asm volatile("" ::: "memory");
    L = 0;                      /* Unlock it. */
}

// 8. Instruction barrier -- machine instructions

0000000000000000 <count>:
   0:   90000000    adrp    x0, 0
            0: R_AARCH64_ADR_PREL_PG_HI21   .bss
   4:   91000000    add     x0, x0, #0x0
            4: R_AARCH64_ADD_ABS_LO12_NC    .bss
   8:   52800021    mov     w1, #0x1
                        // while (test_and_set(&L)) ...
   c:   885f7c02    ldxr    w2, [x0]
  10:   88037c01    stxr    w3, w1, [x0]
  14:   34ffffc3    cbz     w3, c
  18:   35ffffa2    cbnz    w2, c
  1c:   f9400401    ldr     x1, [x0, #8]    // t = C
  20:   91000421    add     x1, x1, #0x1    // t += 1
  24:   f9000401    str     x1, [x0, #8]    // C = t
  28:   b900001f    str     wzr, [x0]       // L = 0
  2c:   d65f03c0    ret

// 9. CPU cores -> shared memory

    +----------+
    | compiler |----------------------------.
    +----------+                             \
      \  \  /  |  \  \   machine              \
       |    |    |    |  instructions          |
       V    V    V    V                        |
    +------+ +------+ +------+ +------+        |
    | cpu0 | | cpu1 | | cpu2 | | cpu3 |        |
    +------+ +------+ +------+ +------+        |
        \       |       /      /               |
         |      |      /      /                |
         V      V     /      /  memory         |
      +---------------+     /   operations     |
      | shared memory |<-----------------------'
      +---------------+

// 10. CPU cores -> shared memory (with cache interconnect)

    +----------+
    | compiler |----------------------------.
    +----------+                             \
      \  \  /  |  \  \   machine              \
       |    |    |    |  instructions          |
       V    V    V    V                        |
    +------+ +------+ +------+ +------+        |
    | cpu0 | <====> | cpu1 | <====> | cpu2 | <====> | cpu3 |
    +------+ +------+ +------+ +------+        |
        \       |       /      /               |
         |      |      /      /                |
         V      V     /      /  memory         |
      +---------------+     /   operations     |
      | shared memory |<-----------------------'
      +---------------+

// 11. CPU cores -> shared memory (with cache interconnect)

    +----------+
    | compiler |----------------------------.
    +----------+                             \
      \  \  /  |  \  \   machine              \
       |    |    |    |  instructions          |
       V    V    V    V                        |
    +------+ +------+ +------+ +------+        |
    | cpu0 | <====> | cpu1 | <====> | cpu2 | <====> | cpu3 |
    +------+ +------+ +------+ +------+        |
        \       |       /      /               |
         |      |      /      /                |
         V      V     /      /  memory         |
      +---------------+     /   operations     |
      | shared memory |<-----------------------'
      +---------------+

// Very MESI!

// 12. Interleaved execution -- what we don't want

    /* cpu0 */                          /* cpu1 */
    t0 = C;     // t0 is 42
    t0 += 1;    // t0 is 43
                                        t1 = C;     // t1 is 42
                                        t1 += 1;    // t1 is 43
    C = t0;     // C is now 43
                                        C = t1;     // C is now 43

// 13. Serialization -- what we want

    /* cpu0 */                          /* cpu1 */
    t0 = C;     // t0 is 42
    t0 += 1;    // t0 is 43
    C = t0;     // C is now 43
                                        t1 = C;     // t1 is 43
                                        t1 += 1;    // t1 is 44
                                        C = t1;     // C is now 44

// 14. Counter with lock -- recap

    while (test_and_set(&L))
        continue;
    t = C;      // critical section begins
    t += 1;
    C = t;      // critical section ends
    L = 0;

// 14. Acquire/release operations -- single-thread view

    while (test_and_set(&L))
        continue;
    ACQUIRE_BARRIER();
    t = C;      // critical section begins
    t += 1;
    C = t;      // critical section ends
    RELEASE_BARRIER();
    L = 0;

// 15. Release/acquire operations -- multi-thread view

    /* cpu0 */                          /* cpu1 */
    ...
    t += 1;
    C = t;                              // critical section A ends
    RELEASE_BARRIER();
    L = 0;
        /// SYNCHRONIZATION EVENT OCCURS HERE AT L ///
                                        while (test_and_set(&L))
                                            continue;
                                        ACQUIRE_BARRIER();
                                        t = C;  // critical section B begins
                                        ...

// What we want: A happens before B

// 16. Release/acquire operations -- multi-thread view

    /* cpu0 */                          /* cpu1 */
    ...
    t += 1;
    C = t;                              // critical section A ends
    RELEASE_BARRIER();
    L = 0;
        /// SYNCHRONIZATION EVENT OCCURS HERE AT L ///
                                        while (test_and_set(&L))
                                            continue;
                                        ACQUIRE_BARRIER();
                                        t = C;  // critical section B begins
                                        ...

// What we want: A happens before B
// How we get it:
// - Barrier to require A happens before store L = 0
// - Barrier to require load L (provided it is zero) happens before B

// 17. Release/acquire operations -- as spelled in NetBSD

    /* cpu0 */                          /* cpu1 */
    ...
    t += 1;
    C = t;                              // critical section A ends
    membar_release();
    L = 0;
        /// SYNCHRONIZATION EVENT OCCURS HERE AT L ///
                                        while (test_and_set(&L))
                                            continue;
                                        membar_acquire();
                                        t = C;  // critical section B begins
                                        ...

// What we want: A happens before B
// How we get it:
// - Barrier to require A happens before store L = 0
// - Barrier to require load L (provided it is zero) happens before B

// 18. Optimization: store-release and load-acquire operations

    /* cpu0 */                          /* cpu1 */
    ...
    t += 1;
    C = t;                              // critical section A ends
    // store-release
    atomic_store_release(&L, 0);        // L = 0
        /// SYNCHRONIZATION EVENT OCCURS HERE AT L ///
                                        while (atomic_swap_acquire(&L, 1)) // (*)
                                            continue;
                                        // load-acquire
                                        t = C;  // critical section B begins
                                        ...

// What we want: A happens before B
// How we get it:
// - A happens before store-release at L
// - Load-acquire happens before B
//
// (*) not (yet) available in NetBSD, shorthand for C11
//     atomic_exchange_explicit(p, x, memory_order_acquire)

// 19. Avoiding contention -- unlocked table lookup, first try

struct kmutex tablock;
struct record {
    bool occupied;
    struct kmutex lock;
    int tag;
    int data;
    ...
} recordtab[1024];

    /* cpu0 */                          /* cpu1 */
    mutex_enter(&tablock);
    i = allocate_record();
    recordtab[i].tag = 1234;
    mutex_init(&recordtab[i].lock, ...);
                                        // try lookup -- not yet done, should fail
    recordtab[i].data = 1;              if (!recordtab[i].occupied)
    recordtab[i].occupied = true;           goto fail;
    mutex_exit(&tablock);
                                        // try lookup again -- done, should succeed
                                        if (!recordtab[i].occupied)
                                            goto fail;
                                        tag = recordtab[i].tag;
                                        mutex_enter(&recordtab[i].lock);
                                        use_data(recordtab[i].data++);
                                        ...

// 20. Avoiding contention -- unlocked table lookup, publisher missing barrier

    /* original code */                 /* equivalent, according to compiler */
    mutex_enter(&tablock);              mutex_enter(&tablock);
    i = allocate_record();              i = allocate_record();
    recordtab[i].tag = 1234;            recordtab[i].occupied = true;
    mutex_init(&recordtab[i].lock, ...); recordtab[i].data = 1;
    recordtab[i].data = 1;              recordtab[i].tag = 1234;
    recordtab[i].occupied = true;       mutex_init(&recordtab[i].lock, ...);
    mutex_exit(&tablock);               mutex_exit(&tablock);

// 21. Avoiding contention -- unlocked table lookup, user missing barrier

    /* original code */                 /* equivalent code, according to CPU */
                                        tag = recordtab[i].tag;
    if (!recordtab[i].occupied)         if (!recordtab[i].occupied)
        goto fail;                          goto fail;
    tag = recordtab[i].tag;
    mutex_enter(&recordtab[i].lock);    mutex_enter(&recordtab[i].lock);
    use_data(recordtab[i].data++);      use_data(recordtab[i].data++);
    ...                                 ...

// 23. Avoiding contention -- unlocked table lookup with barriers

    /* cpu0 */                          /* cpu1 */
    mutex_enter(&tablock);
    i = allocate_record();
    recordtab[i].tag = 1234;
    mutex_init(&recordtab[i].lock, ...);
    recordtab[i].data = 1;              // critical section A ends
    membar_release();
    recordtab[i].occupied = true;
        /// SYNCHRONIZATION EVENT OCCURS HERE AT recordtab[i].occupied ///
                                        if (!recordtab[i].occupied)
                                            goto fail;
                                        membar_acquire();
                                        // critical section B begins
                                        tag = recordtab[i].tag;
                                        mutex_enter(&recordtab[i].lock);
                                        use_data(recordtab[i].data++);
                                        ...

// 24.
Avoiding contention -- unlocked table lookup with release/acquire

    /* cpu0 */                          /* cpu1 */
    mutex_enter(&tablock);
    i = allocate_record();
    recordtab[i].tag = 1234;
    mutex_init(&recordtab[i].lock, ...);
    recordtab[i].data = 1;              // critical section A ends
    atomic_store_release(&recordtab[i].occupied, true);
        /// SYNCHRONIZATION EVENT OCCURS HERE AT recordtab[i].occupied ///
                                        p = &recordtab[i].occupied;
                                        if (!atomic_load_acquire(p))
                                            goto fail;
                                        // critical section B begins
                                        tag = recordtab[i].tag;
                                        mutex_enter(&recordtab[i].lock);
                                        use_data(recordtab[i].data++);
                                        ...

// 25. Reference counts -- first try

    // (pretend --obj->refcnt is atomic)
    x = obj->data;
    if (--obj->refcnt == 0) {
        /* last reference, destroy data and free */
        memset(obj, 0x5a, sizeof(*obj));
        free(obj);
    }
    return x;

// 26. Reference counts -- missing barriers, equivalent sequential code

    // (pretend --obj->refcnt is atomic)
    if (--obj->refcnt == 0) {
        x = obj->data;
        /* last reference, destroy data and free */
        memset(obj, 0x5a, sizeof(*obj));
        free(obj);
    } else {
        x = obj->data;
    }
    return x;

// 26. Reference counts -- missing barriers, running on two CPUs

    // (pretend --obj->refcnt is atomic)

    /* cpu0 */                          /* cpu1 */
    if (--obj->refcnt == 0) {
        /* down to 1, not taken */
                                        if (--obj->refcnt == 0) {
        ...                                 /* down to 0, taken */
                                            memset(obj, 0x5a, sizeof(*obj));
    } else {
        /* loads 0x5a5a5a5a */
        x = obj->data;
    }                                       free(obj);
                                        } else {
                                            /* not taken */
                                            ...
                                        }
    return x; /* returns 0x5a5a5a5a */
                                        return x; /* returns 42 */

// 27. Reference counts -- with barriers

    // (pretend --obj->refcnt is atomic)
    x = obj->data;
    membar_release();
    if (--obj->refcnt == 0) {
        membar_acquire();
        memset(obj, 0x5a, sizeof(*obj));
        free(obj);
    }
    return x;

// 28. Reference counts -- with barriers and atomics, of course

    x = obj->data;
    membar_release();
    if (atomic_dec_uint_nv(&obj->refcnt) == 0) {
        membar_acquire();
        memset(obj, 0x5a, sizeof(*obj));
        free(obj);
    }
    return x;

// 29.
Optimization: write-only producers, read-only consumers (Linux seqlock)

unsigned version;       // version number
unsigned stats[128];    // set of event counters

    /* producer (serialized by lock) */ /* consumer (any number in parallel) */
    version |= 1;   /* begin write */   while ((v = version) & 1)
                                            continue;   /* await write */
    membar_producer();                  membar_consumer();
    stats[0] = ...;                     sum = f(stats[0], stats[1], ...)
    stats[1] = ...;
    ...
    membar_producer();                  membar_consumer();
    version += 1;   /* end write */     if (v != version)
                                            goto top;

// 30. Optimization: write-only producers, read-only consumers (Lamport)

unsigned v1, v2;        // version numbers -- different while write in progress
unsigned stats[128];    // set of event counters

    /* producer (serialized by lock) */ /* consumer (any number in parallel) */
    v1++;   /* write in progress */     v = v2;  /* read versions in reverse */
    membar_producer();                  membar_consumer();
    stats[0] = ...;                     sum = f(stats[0], stats[1], ...)
    stats[1] = ...;
    ...
    membar_producer();                  membar_consumer();
    v2++;   /* end write */             if (v != v1)
                                            goto top;

// 31. Red flag: Store-before-load
//
// Do not use membar_sync without good and clearly documented reason!
//
// membar_sync = load-before-load
//               and load-before-store
//               and store-before-store
//               ...AND store-before-load.
//
// Very difficult to reason about!

// Legacy barriers:
//
// membar_enter = store-before-load and load-before-load
//      -> not really useful, not cheaper than membar_sync anywhere
// membar_exit = load-before-store and store-before-store
//      -> legacy alias for membar_release

// 32.
Red flags: Dekker's algorithm (two-thread version, thread 0 & thread 1)

int waiting[2];
int turn;

void lock(int self)
{
top:    waiting[self] = 1;
    membar_sync();      // store-before-load barrier just for Dekker
    while (waiting[1 ^ self]) {
        if (turn != self) {
            waiting[self] = 0;
            while (turn != self)
                continue;
            goto top;
        }
    }
    membar_acquire();   // acquire barrier for normal lock acquisition
}

void unlock(int self)
{
    membar_release();   // release barrier for normal lock acquisition
    turn = 1 ^ self;
    waiting[self] = 0;
}

// 33. Other barriers: bus_space_barrier -- write-combining buffers

    bus_space_map(framebuftag, framebufaddr, framebufsize,
        BUS_SPACE_MAP_LINEAR|BUS_SPACE_MAP_PREFETCHABLE,
        &framebufhandle);
    fb = bus_space_vaddr(framebuftag, framebufhandle);

    memcpy(fb, ...);
    bus_space_barrier(framebuftag, framebufhandle,
        BUS_SPACE_BARRIER_WRITE);
    memcpy(fb, ...);

// 34. Other barriers: bus_dmamap_sync -- CPU vs (bouncy) DMA I/O engine

    /* tx output routine */
    bus_dmamap_sync(..., txbuf_map, 0, pktlen, ...);
    txdesc->addr = ...;
    txdesc->pktlen = pktlen;
    bus_dmamap_sync(..., txdesc_map, txdesc_off, sizeof(*txdesc), ...)
    txdesc->own = 0;
    bus_space_write_4(sc->sc_iot, sc->sc_ioh, DOORBELL, 1);

    /* rx interrupt routine */
    bus_dmamap_sync(..., rxdesc_off, sizeof(*rxdesc), ...);
    if (!rxdesc->own)
        break;
    bus_dmamap_sync(..., rxbuf_map, 0, rxdesc->pktlen, ...);
    process_packet(rxbuf);

// Also: Most barriers are in the implementation of synchronization
// abstractions!  Not in regular code.

// 35. Questions?

      ???????
    ????????????
    ????    ?????
    ???      ????
            ????
           ???
          ???
         ??
         ???
         ????
          ???
         ????
          ??