// Basic objective: come up with a smarter version of the standard
// `strings` utility that will find "_Exit" and "trim" as strings, but
// reject "<!tu<*tq<(tm<)f".

// Intended approach: tabulate a dimension-reduced vector of byte
// 2-gram frequencies, using some training text to find clusters of
// similar characters, and making 2-grams of the cluster IDs rather
// than of the actual bytes; use the dimension-reduced 2-gram model
// somehow to discriminate between "text" and "non-text".  Maybe
// measure strings against a few different "source models", such as a
// "random binary data source", an "English text source", a "C text
// source", a "random hexadecimal bytes source", and a "random
// printable ASCII source", and see which one produces the lowest
// entropy estimate.  That still leaves the question of where to cut
// the text into strings, of course... maybe recognize when your last
// four bytes have changed language, then look for the lowest-entropy
// cut point?

// First step: try to cluster bytes.  The standard approach to this is
// to use K-means.  I have a different, cruder idea: I will initially
// categorize each byte in a cluster named by the byte that most often
// precedes it; I will then reduce the number of clusters to a
// specified number, and assign each byte to the remaining cluster
// whose name most often precedes it.

// I think the crudity of the results from this program justifies
// going to the effort to try K-means and similar classifiers.  Here's
// the output on 270 megabytes of essentially English text from
// Wikipedia with 16 clusters:

/*
\x00: \x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xd9
\x09: \x09
\x0a: \x0a;>}
\x20: !"%&)*+,-.345678:?@X[\^defgnsy|~\xa2\xa4\xa5\xa7\xa8\xa9\xaa\xab\xae\xaf\xb1\xb2\xb3\xb6\xb9\xba\xbb\xbe\xc0\xcb\xd0\xd3\xd7\xe0\xe9\xec\xf2\xf7\xf9\xfd
": =
): \xdb
.: E
/: \xd5
0: 02
1: #$(\xa3
9: 19
<: \x00
A: F`
C: \xb0
D: N
E: L\xa1
F: O
M: \xa0
N: \xc2\xcc\xd2\xd4
Q: \xbf
R: A
S: \xc8
]: ]
_: _
a: BJMPWw\xa6\xc7\xcf\xd1\xed\xf0\xf1
b: \xdc
c: \xc9\xff
d: /<
e: DGHRZbchklmrvz\xb8\xdf\xf5
h: Tt
i: KV\xac\xb4\xc3\xfe
l: \xc1\xc4\xc5\xce\xda\xeb\xfb
m: \xb5\xb7
n: IUaio\xe1\xe4\xee\xf3\xfa
o: CYj\xde\xe3\xe7
r: pu\xe5\xe6\xe8\xf6\xf8\xfc
s: '\xcd\xd6\xd8
t: \x20Sx\xad\xc6\xca\xe2\xea\xef\xf4
u: Qq\xbc\xbd\xdd
{: {

Reduced to 16 clusters:
\x00: \x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xd9\xdb\xdd
\x0a: \x0a;>}
\x20: \x09!"%&')*+,-.345678:=?@IUX[\]^defgnoqsxy{|~\xa2\xa4\xa5\xa7\xa8\xa9\xaa\xab\xae\xaf\xb0\xb1\xb2\xb3\xb6\xb9\xba\xbb\xbc\xbe\xc0\xc8\xca\xcb\xcd\xd0\xd3\xd7\xd8\xdc\xe0\xe1\xe3\xe5\xe9\xea\xec\xee\xf2\xf3\xf4\xf7\xf9\xfa\xfd
0: 02
1: #$(\xa3
9: 19
A: F`\xb5\xbf
E: L\xa1
N: E\xc2\xcc\xd2\xd4
a: \x20BCJMPWYw\xa6\xc7\xcf\xd1\xd5\xe7\xed\xf0\xf1
c: Oi\xbd\xc9\xe8\xff
d: /<\xe6\xef\xf8
e: DGHNRSZbchjklmprvz\xb8\xdf\xf5
h: Tt
i: KQV_\xa0\xac\xb4\xc3\xde\xfe
l: Aau\xad\xb7\xc1\xc4\xc5\xc6\xce\xd6\xda\xe2\xe4\xeb\xf6\xfb\xfc

That looks sort of reasonable in some ways --- it's successfully
lumped most of the consonants together, recognized that 'T' and 't' are
a separate class in that it tends to precede 'h', separated out the
control characters as bytes that don't occur, and managed to cordon
off most of the punctuation in its own category, except for
punctuation that tends to precede numbers.  But it's put ' ' (\x20) in
the same category as the less-common consonants, like B, C, and J!  If
I swizzle the order so it's grouping characters that *precede* the
same thing, I get totally different but equally unreasonable results:

Reduced to 16 clusters:
\x00: \x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xd9
\x0a: \x0a!<\xb7
\x20: \x09"#$&(*+-/123;>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^`abchijmoptw{|}~\xa1\xa3\xa4\xa5\xa6\xa7\xa9\xab\xac\xaf\xb1\xb4\xb5\xb6\xbb\xbc\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xda\xdb\xdc\xdd\xde\xe0\xf7\xfe
0: %0\xa0\xa2\xb0\xba
1: 89\xaa
9: 4567
C: \xbd\xf4
a: kl\xdf\xee\xef\xf1\xfa\xff
d: =\xe9\xf8\xfb
e: \x20),.:?_qrsx\xad
h: e\xe1\xe2\xf6
i: nvz\xa8\xb8\xb9\xe8\xea\xf0\xf3\xf9
l: y\xe4\xe6\xec\xed\xf2\xfc
m: \xb2\xe5\xf5
n: 'dg\xae\xb3\xe7\xfd
o: fu\xe3\xeb

Now the entire uppercase and most of the lowercase alphabet are lumped
together with punctuation as "things that tend to precede spaces".
This is not going to help me accept "trim" but not "<!tu<*tq<(tm<)f".

I think there might be enough structure to the problem of assigning
bytes to classes that I can come up with a metric of goodness.

*/

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>

static void note_bytepair(unsigned pc, unsigned c);
static void cluster(void);
static void display_clusters(void);
static void reduce_clusters(int n_clusters);
     
int 
main(int argc, char **argv)
{
     int c, pc = 0, n_clusters = argc > 1 ? atoi(argv[1]) : 4;
  
     while ((c = getchar()) != EOF) {
          note_bytepair(pc, c);
          pc = c;
     }

     cluster();
     display_clusters();

     printf("\nReduced to %d clusters:\n", n_clusters);
     reduce_clusters(n_clusters);
     display_clusters();
     
     return 0;
}

static int counts[256][256];
static void
note_bytepair(unsigned pc, unsigned c)
{
     counts[pc][c]++;
}

static unsigned char cluster_of[256];
static void cluster()
{
     for (int ii = 0; ii < 256; ii++) {
          for (int jj = 0; jj < 256; jj++) {
               if (counts[ii][jj] > counts[ii][cluster_of[ii]]) cluster_of[ii] = jj;
          }
     }
}

static void display_char(int c);

static void
display_clusters()
{
     for (int ii = 0; ii < 256; ii++) {
          int line_shown = 0;
          for (int jj = 0; jj < 256; jj++) {
               if (cluster_of[jj] == ii) {
                    if (!line_shown) {
                         display_char(ii);
                         printf(": ");
                         line_shown = 1;
                    }
                    display_char(jj);
               }
               
          }
          if (line_shown) printf("\n");
     }
}

static void
display_char(int c)
{
     if (c > ' ' && c <= '~') {
          putchar(c);
     } else {
          printf("\\x%02x", c);
     }
}

static int n_clusters_bigger_than(int *cluster_sizes, int size);

static void
reduce_clusters(int n_clusters)
{
     assert(n_clusters > 0);
     
     int cluster_sizes[256] = {0};
     for (int ii = 0; ii < 256; ii++) {
          cluster_sizes[cluster_of[ii]]++;
     }

     int min_cluster_size = 0;
     while (n_clusters_bigger_than(cluster_sizes, 
                                   min_cluster_size) >= n_clusters) {
          min_cluster_size++;
     }

     // Okay, now we have a minimum cluster size, and we know the
     // sizes of the original clusters; now we must choose which
     // clusters are eligible.
     char cluster_eligible[256] = {0};
     int n_eligible_clusters = 0;
     int an_eligible_cluster;
     
     for (int ii = 0; ii < 256; ii++) {
          if (n_eligible_clusters < n_clusters && 
              cluster_sizes[ii] >= min_cluster_size) {
               cluster_eligible[ii] = 1;
               if (!n_eligible_clusters) an_eligible_cluster = ii;
               n_eligible_clusters++;
          }
     }

     // And now we must redo the clustering work with only the
     // eligible clusters.  Start by initializing the clusters to a
     // valid state.
     for (int ii = 0; ii < 256; ii++) cluster_of[ii] = an_eligible_cluster;

     // Now we redo cluster(), with an extra criterion.
     for (int ii = 0; ii < 256; ii++) {
          for (int jj = 0; jj < 256; jj++) {
               if (!cluster_eligible[jj]) continue;
               if (counts[ii][jj] > counts[ii][cluster_of[ii]]) cluster_of[ii] = jj;
          }
     }
}

static int
n_clusters_bigger_than(int *cluster_sizes, int size)
{
     int count = 0;
     for (int ii = 0; ii < 256; ii++) {
          if (cluster_sizes[ii] > size) count++;
     }
     return count;
}