// Basic objective: come up with a smarter version of the standard // `strings` utility that will find "_Exit" and "trim" as strings, but // reject "} \x20: !"%&)*+,-.345678:?@X[\^defgnsy|~\xa2\xa4\xa5\xa7\xa8\xa9\xaa\xab\xae\xaf\xb1\xb2\xb3\xb6\xb9\xba\xbb\xbe\xc0\xcb\xd0\xd3\xd7\xe0\xe9\xec\xf2\xf7\xf9\xfd ": = ): \xdb .: E /: \xd5 0: 02 1: #$(\xa3 9: 19 <: \x00 A: F` C: \xb0 D: N E: L\xa1 F: O M: \xa0 N: \xc2\xcc\xd2\xd4 Q: \xbf R: A S: \xc8 ]: ] _: _ a: BJMPWw\xa6\xc7\xcf\xd1\xed\xf0\xf1 b: \xdc c: \xc9\xff d: /< e: DGHRZbchklmrvz\xb8\xdf\xf5 h: Tt i: KV\xac\xb4\xc3\xfe l: \xc1\xc4\xc5\xce\xda\xeb\xfb m: \xb5\xb7 n: IUaio\xe1\xe4\xee\xf3\xfa o: CYj\xde\xe3\xe7 r: pu\xe5\xe6\xe8\xf6\xf8\xfc s: '\xcd\xd6\xd8 t: \x20Sx\xad\xc6\xca\xe2\xea\xef\xf4 u: Qq\xbc\xbd\xdd {: { Reduced to 16 clusters: \x00: \x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xd9\xdb\xdd \x0a: \x0a;>} \x20: \x09!"%&')*+,-.345678:=?@IUX[\]^defgnoqsxy{|~\xa2\xa4\xa5\xa7\xa8\xa9\xaa\xab\xae\xaf\xb0\xb1\xb2\xb3\xb6\xb9\xba\xbb\xbc\xbe\xc0\xc8\xca\xcb\xcd\xd0\xd3\xd7\xd8\xdc\xe0\xe1\xe3\xe5\xe9\xea\xec\xee\xf2\xf3\xf4\xf7\xf9\xfa\xfd 0: 02 1: #$(\xa3 9: 19 A: F`\xb5\xbf E: L\xa1 N: E\xc2\xcc\xd2\xd4 a: \x20BCJMPWYw\xa6\xc7\xcf\xd1\xd5\xe7\xed\xf0\xf1 c: Oi\xbd\xc9\xe8\xff d: /<\xe6\xef\xf8 e: DGHNRSZbchjklmprvz\xb8\xdf\xf5 h: Tt i: KQV_\xa0\xac\xb4\xc3\xde\xfe l: Aau\xad\xb7\xc1\xc4\xc5\xc6\xce\xd6\xda\xe2\xe4\xeb\xf6\xfb\xfc That looks sort of reasonable in some ways --- it's successfully lumped most of the consonants together, recognized that 'T' and 't' are a separate class in that it tends to precede 'h', separated out the control characters as bytes that don't occur, and managed to cordon off most of the punctuation in its own category, except for punctuation that tends to precede numbers. But it's put ' ' (\x20) in the same category as the less-common consonants, like B, C, and J! If I swizzle the order so it's grouping characters that *precede* the same thing, I get totally different but equally unreasonable results: Reduced to 16 clusters: \x00: \x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xd9 \x0a: \x0a!<\xb7 \x20: \x09"#$&(*+-/123;>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^`abchijmoptw{|}~\xa1\xa3\xa4\xa5\xa6\xa7\xa9\xab\xac\xaf\xb1\xb4\xb5\xb6\xbb\xbc\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xda\xdb\xdc\xdd\xde\xe0\xf7\xfe 0: %0\xa0\xa2\xb0\xba 1: 89\xaa 9: 4567 C: \xbd\xf4 a: kl\xdf\xee\xef\xf1\xfa\xff d: =\xe9\xf8\xfb e: \x20),.:?_qrsx\xad h: e\xe1\xe2\xf6 i: nvz\xa8\xb8\xb9\xe8\xea\xf0\xf3\xf9 l: y\xe4\xe6\xec\xed\xf2\xfc m: \xb2\xe5\xf5 n: 'dg\xae\xb3\xe7\xfd o: fu\xe3\xeb Now the entire uppercase and most of the lowercase alphabet are lumped together with punctuation as "things that tend to precede spaces". This is not going to help me accept "trim" but not " #include #include static void note_bytepair(unsigned pc, unsigned c); static void cluster(void); static void display_clusters(void); static void reduce_clusters(int n_clusters); int main(int argc, char **argv) { int c, pc = 0, n_clusters = argc > 1 ? atoi(argv[1]) : 4; while ((c = getchar()) != EOF) { note_bytepair(pc, c); pc = c; } cluster(); display_clusters(); printf("\nReduced to %d clusters:\n", n_clusters); reduce_clusters(n_clusters); display_clusters(); return 0; } static int counts[256][256]; static void note_bytepair(unsigned pc, unsigned c) { counts[pc][c]++; } static unsigned char cluster_of[256]; static void cluster() { for (int ii = 0; ii < 256; ii++) { for (int jj = 0; jj < 256; jj++) { if (counts[ii][jj] > counts[ii][cluster_of[ii]]) cluster_of[ii] = jj; } } } static void display_char(int c); static void display_clusters() { for (int ii = 0; ii < 256; ii++) { int line_shown = 0; for (int jj = 0; jj < 256; jj++) { if (cluster_of[jj] == ii) { if (!line_shown) { display_char(ii); printf(": "); line_shown = 1; } display_char(jj); } } if (line_shown) printf("\n"); } } static void display_char(int c) { if (c > ' ' && c <= '~') { putchar(c); } else { printf("\\x%02x", c); } } static int n_clusters_bigger_than(int *cluster_sizes, int size); static void reduce_clusters(int n_clusters) { assert(n_clusters > 0); int cluster_sizes[256] = {0}; for (int ii = 0; ii < 256; ii++) { cluster_sizes[cluster_of[ii]]++; } int min_cluster_size = 0; while (n_clusters_bigger_than(cluster_sizes, min_cluster_size) >= n_clusters) { min_cluster_size++; } // Okay, now we have a minimum cluster size, and we know the // sizes of the original clusters; now we must choose which // clusters are eligible. char cluster_eligible[256] = {0}; int n_eligible_clusters = 0; int an_eligible_cluster; for (int ii = 0; ii < 256; ii++) { if (n_eligible_clusters < n_clusters && cluster_sizes[ii] >= min_cluster_size) { cluster_eligible[ii] = 1; if (!n_eligible_clusters) an_eligible_cluster = ii; n_eligible_clusters++; } } // And now we must redo the clustering work with only the // eligible clusters. Start by initializing the clusters to a // valid state. for (int ii = 0; ii < 256; ii++) cluster_of[ii] = an_eligible_cluster; // Now we redo cluster(), with an extra criterion. for (int ii = 0; ii < 256; ii++) { for (int jj = 0; jj < 256; jj++) { if (!cluster_eligible[jj]) continue; if (counts[ii][jj] > counts[ii][cluster_of[ii]]) cluster_of[ii] = jj; } } } static int n_clusters_bigger_than(int *cluster_sizes, int size) { int count = 0; for (int ii = 0; ii < 256; ii++) { if (cluster_sizes[ii] > size) count++; } return count; }