/* Word segmentation, following
   <http://thenoisychannel.com/2011/08/08/retiring-a-great-interview-problem/> 
*/
#define _BSD_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

void insert(char *s);
char **bucket(char *s, int len);
char *xstrdup(char *s);
void *xmalloc(size_t size);
void die(char *why);
void load_dict(char *filename);
void clear_dict();
int *segment(char *words);

int main(int argc, char **argv)
{
     int *word_lengths;
     int word_pointer = 0;
     if (argc != 2) {
          die("usage: wordseg somechunkofwords");
     }
     load_dict("/usr/share/dict/words");
     
     word_lengths = segment(argv[1]);
     if (!word_lengths) {
          printf("no segmentation found for %s\n", argv[1]);
          return 1;
     }
     
     for (;;) {
          printf("%.*s", word_lengths[word_pointer], &argv[1][word_pointer]);
          word_pointer += word_lengths[word_pointer];

          /* A zero-length word terminates the segmentation. */
          if (!word_lengths[word_pointer]) {
     break;
          }
          printf(" ");
     }
     printf("\n");

     free(word_lengths);
     clear_dict();

     return 0;
}


/* Dictionary file */

enum { maxwordlen = 32 } ; 

void load_dict(char *filename)
{
     FILE *f = fopen(filename, "r");
     char buf[maxwordlen];
     
     if (!f) {
          die(filename);
     }
     
     while (fgets(buf, sizeof buf, f)) {
          char *nl;
          if (strlen(buf) == maxwordlen - 1) {
               die("line too long in dictionary");
          }
          
          if ((nl = strchr(buf, '\n'))) {
               *nl = '\0';
          }
          insert(xstrdup(buf));
     }
     
     fclose(f);
}

/* Segmentation by dynamic programming */

int *make_segmentation(int *seglen, int nchars);

int *segment(char *s) 
{
     int nchars = strlen(s);
     /* seglen[i] for i > 0 is 0 iff there is no known segmentation of
      * the first i characters of s, which are at indices 0, 1,
      * ... i-1.  Otherwise, it’s the length of the last word of that
      * segmentation, and the previous word’s length is to be found at
      * seglen[i - seglen[i]], unless seglen[i] == i, in which case
      * there is no previous word.
      */
     int *seglen = xmalloc(sizeof(*seglen) * (nchars+1));
     int i, j;
     
     for (i = 0; i != nchars+1; i++) {
          seglen[i] = 0;

          for (j = 0; j != i; j++) {
               /* See if there’s a segmentation ending at j that can
                * be extended with a word to i. */
               if (j != 0 && !seglen[j]) {
                    /* There’s no segmentation ending at j. */
                    continue;
               }
               /* Is there a word? */
               if (*bucket(&s[j], i-j)) {
                    /* Yes, that’s a word. */
                    seglen[i] = i-j;
                    break;
               }
          }
     }

     if (!seglen[nchars]) {
          return 0;
     }
     
     return make_segmentation(seglen, nchars);
}

/* See comments inside segment() for original meaning of the contents
 * of word_lengths.  This function “reverses the pointers” to make the
 * forward-iterable version that main() expects, leaving the rest of
 * word_lengths as useless garbage.
 */
int *make_segmentation(int *word_lengths, int nchars)
{
     int last_length = 0;       /* A zero-length word terminates, etc. */
     int word_pointer = nchars;
     do {
          int length = word_lengths[word_pointer];
          word_lengths[word_pointer] = last_length;
          word_pointer -= length;
          last_length = length;
     } while (last_length);
     
     return word_lengths;
}


/* Hash table */

/*
 * Open addressing with linear probing.
 * Hash table being full is “reported” by looping infinitely.
 */

enum { largeprime = 123456791, dictsize = 200003 };

int hash(char *s, int len);
static char *dict[dictsize];

char **bucket(char *s, int len)
{
     int rv = hash(s, len) % dictsize;
     while (dict[rv] && (len != strlen(dict[rv]) ||
                         memcmp(dict[rv], s, len))) {
          rv = (rv + 1) % dictsize;
     }
     return &dict[rv];
}

void insert(char *s)
{
     *bucket(s, strlen(s)) = s;
}

int hash(char *s, int len)
{
     unsigned int rv = 0;
     
     while (len--) {
          rv = (rv * 256 + *s++) % largeprime;
     }
     return rv;
}

void clear_dict()
{
     int i;
     for (i = 0; i != dictsize; i++) {
          free(dict[i]);
     }
}


/* Memory allocation */

char *xstrdup(char *s) 
{
     char *rv = strdup(s);
     if (!rv) {
          die("strdup");
     }
     return rv;
}

void *xmalloc(size_t size)
{
     void *rv = malloc(size);
     if (!rv) {
          die("malloc");
     }
     return rv;
}


/* Error reporting */

void die(char *why)
{
     perror(why);
     abort();
}

