#include <stddef.h>
#include <stdint.h>
#include <string.h>

#if defined(__GNUC__) && \
    (defined(__x86_64__) || defined(__aarch64__) || \
     (defined(__arm__) && defined(__ARM_NEON)))

/*
 * GCC (and clang) do a good job of auto-vectorizing this as of 2025 on
 * amd64, aarch64, and armv7hf.  Other architectures usually require
 * runtime tests for vector units.
 *
 * clang defines __GNUC__ but does not implement GCC's optimize
 * attribute and only warns that it is ignored, so the attribute is
 * requested from real GCC only.
 */
#if defined(__clang__)
#define	COUNTNL_VECTORIZE
#else
#define	COUNTNL_VECTORIZE	__attribute__((optimize("tree-vectorize")))
#endif

/*
 * countnl(buf, len)
 *
 *	Return the number of '\n' octets among the len octets
 *	starting at buf.
 */
COUNTNL_VECTORIZE
size_t
countnl(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	size_t nl = 0;

	for (size_t i = 0; i < len; i++)
		nl += (p[i] == '\n');
	return nl;
}

#elif defined(_LP64)

/*
 * load64(src)
 *
 *	Read eight octets starting at src into a 64-bit word.  Going
 *	through memcpy keeps the access well-defined when src is not
 *	8-byte aligned or does not point at uint64_t storage.
 */
static inline uint64_t
load64(const uint8_t *src)
{
	uint64_t w;

	memcpy(&w, src, sizeof(w));
	return w;
}

/*
 * matchnl8(x)
 *
 *	Split the 64-bit quantity x into eight octets.  Map each octet
 *	to 0 if it is equal to 0x0a, and otherwise to 1.
 */
static inline uint64_t
matchnl8(uint64_t x)
{
	const uint64_t hi = UINT64_C(0x8080808080808080);
	const uint64_t lo = UINT64_C(0x0101010101010101);
	const uint64_t nl = UINT64_C(0x0a0a0a0a0a0a0a0a);
	uint64_t y = x & hi;	/* octets with high bit set */
	uint64_t z = x ^ nl;	/* zero if 0x0a, else nonzero */

	z |= hi;		/* set borrow in all octets */
	z -= lo;		/* clear borrow in the zero octets */
	z &= hi;		/* just the remaining borrow bits */
	z |= y;			/* high bit set or borrow still set */
	z >>= 7;		/* 0 if 0x0a, else 1 */
	return z;
}

/*
 * sum8(z)
 *
 *	Add up the eight octets of z.  The top octet of the product
 *	is the sum column of a schoolbook multiplication; it is exact
 *	as long as the total fits in one octet, which holds for the
 *	callers here (at most 4*8 = 32).
 */
static inline size_t
sum8(uint64_t z)
{

	return (size_t)((z * UINT64_C(0x0101010101010101)) >> 56);
}

/*
 * countnl(buf, len)
 *
 *	Return the number of 0x0a octets among the len octets starting
 *	at buf.  buf needs no particular alignment.
 */
size_t
countnl(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	const size_t nwords = len / 8;
	const size_t tail = len % 8;
	size_t nl = 0;
	size_t i = 0;

	/*
	 * Bite off 4-element groups of 8-byte words.
	 */
	for (; i + 4 <= nwords; i += 4) {
		uint64_t z0 = matchnl8(load64(p + 8*(i + 0)));
		uint64_t z1 = matchnl8(load64(p + 8*(i + 1)));
		uint64_t z2 = matchnl8(load64(p + 8*(i + 2)));
		uint64_t z3 = matchnl8(load64(p + 8*(i + 3)));

		/*
		 * Sum each octet in parallel columns.  Each octet is
		 * either 0 or 1, so the sum of each column can't
		 * overflow and carry into another column.  The result
		 * has either 0, 1, 2, 3, or 4 in each octet, counting
		 * the number of mismatches in that octet position
		 * across the group.
		 */
		uint64_t z = z0 + z1 + z2 + z3;

		/*
		 * Each octet of z is the number of non-0x0a octets in
		 * a column of the input block.  Sum the columns
		 * (maximum possible value is 4*8 = 32 = 0x20) and
		 * subtract from the total number of octets in the
		 * block to count the number of 0x0a octets.
		 */
		nl += 32 - sum8(z);
	}

	/*
	 * Bite off 8-byte words.
	 */
	for (; i < nwords; i++)
		nl += 8 - sum8(matchnl8(load64(p + 8*i)));

	/*
	 * Bite off the remainder if any by zero-padding it up to an
	 * 8-byte word.  The zero padding never matches 0x0a, so it
	 * only adds to the mismatch count that is subtracted from 8.
	 */
	if (tail) {
		uint64_t w = 0;

		memcpy(&w, p + 8*i, tail);
		nl += 8 - sum8(matchnl8(w));
	}

	return nl;
}

#else

/*
 * load32(src)
 *
 *	Read four octets starting at src into a 32-bit word.  Going
 *	through memcpy keeps the access well-defined when src is not
 *	4-byte aligned or does not point at uint32_t storage.
 */
static inline uint32_t
load32(const uint8_t *src)
{
	uint32_t w;

	memcpy(&w, src, sizeof(w));
	return w;
}

/*
 * matchnl4(x)
 *
 *	Split the 32-bit quantity x into four octets.  Map each octet
 *	to 0 if it is equal to 0x0a, and otherwise to 1.
 */
static inline uint32_t
matchnl4(uint32_t x)
{
	const uint32_t hi = UINT32_C(0x80808080);
	const uint32_t lo = UINT32_C(0x01010101);
	const uint32_t nl = UINT32_C(0x0a0a0a0a);
	uint32_t y = x & hi;	/* octets with high bit set */
	uint32_t z = x ^ nl;	/* zero if 0x0a, else nonzero */

	z |= hi;		/* set borrow in all octets */
	z -= lo;		/* clear borrow in the zero octets */
	z &= hi;		/* just the remaining borrow bits */
	z |= y;			/* high bit set or borrow still set */
	z >>= 7;		/* 0 if 0x0a, else 1 */
	return z;
}

/*
 * sum4(z)
 *
 *	Add up the four octets of z.  The product is truncated to 32
 *	bits before the shift so that only the sum column remains; it
 *	is exact as long as the total fits in one octet, which holds
 *	for the callers here (at most 4*4 = 16).
 */
static inline size_t
sum4(uint32_t z)
{

	return (size_t)((uint32_t)(z * UINT32_C(0x01010101)) >> 24);
}

/*
 * countnl(buf, len)
 *
 *	Return the number of 0x0a octets among the len octets starting
 *	at buf.  buf needs no particular alignment.
 */
size_t
countnl(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	const size_t nwords = len / 4;
	const size_t tail = len % 4;
	size_t nl = 0;
	size_t i = 0;

	/*
	 * Bite off 4-element groups of 4-byte words, summing the
	 * per-octet mismatch flags in parallel columns (each column
	 * is at most 4, so no carries between octets).
	 */
	for (; i + 4 <= nwords; i += 4) {
		uint32_t z0 = matchnl4(load32(p + 4*(i + 0)));
		uint32_t z1 = matchnl4(load32(p + 4*(i + 1)));
		uint32_t z2 = matchnl4(load32(p + 4*(i + 2)));
		uint32_t z3 = matchnl4(load32(p + 4*(i + 3)));
		uint32_t z = z0 + z1 + z2 + z3;

		nl += 16 - sum4(z);
	}

	/*
	 * Bite off 4-byte words.
	 */
	for (; i < nwords; i++)
		nl += 4 - sum4(matchnl4(load32(p + 4*i)));

	/*
	 * Bite off the remainder if any by zero-padding it up to a
	 * 4-byte word.  The zero padding never matches 0x0a.
	 */
	if (tail) {
		uint32_t w = 0;

		memcpy(&w, p + 4*i, tail);
		nl += 4 - sum4(matchnl4(w));
	}

	return nl;
}

#endif

#include <err.h>
#include <stdio.h>
#include <unistd.h>

/* Input buffer shared by all read() calls in main. */
uint64_t buf[65536];

/*
 * Count the newline octets on standard input and print the total,
 * followed by a newline, on standard output.  Exits via err(3) if
 * read(2) fails; otherwise the exit status is nonzero iff stdout is
 * in an error state after the final flush.
 */
int
main(void)
{
	ssize_t nread;
	size_t nl = 0;

	while ((nread = read(STDIN_FILENO, buf, sizeof(buf))) != 0) {
		if (nread == -1)
			err(1, "read");
		nl += countnl(buf, (size_t)nread);
	}

	printf("%zu\n", nl);
	fflush(stdout);
	return ferror(stdout) != 0;
}