/* xefyl/wordenizer.c */

/* Tokenize words, letter by letter.
 * Supports Latin characters.
 * Compile with gcc
 *
 * (c) 2021 Sakuragasaki46
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <locale.h>
#include <wctype.h>
#include <ctype.h>


// user interface
// HELP is a printf() format string: both %s placeholders take the program name.
const char HELP[] =
  "%s - words in a text file\n"
  "Prints a list of words in lowercase form, one per line.\n"
  "Supports UTF-8 strings.\n"
  "\n"
  "Usage: %s [filename]\n"
  "\n"
  "Arguments:\n"
  "\tfilename\t\tfilename to analyze (defaults to stdin)\n"
  "\t-s\t\tcase sensitive\n"
  "\t-n\t\tdon’t normalize non-ASCII characters\n"
  "\t--help\t\tshows this help message and exits\n"
  "\t--version\t\tshow version and exit\n";

// printed verbatim by --version
const char VERSION[] = "0.2";

// behavior switches
// non-zero: keep the original letter case (set by -s)
int cASE_SENSITIVE = 0;
// non-zero: do not transliterate accented Latin letters to ASCII (set by -n)
int nO_NORMALIZE = 0;

// table data for normCharcode()
//
// Each pair of tables maps a block of code points to an ASCII replacement.
// Both tables of a pair are indexed by (code point - first code point of
// the block):
//   - a space in the first table means "emit nothing";
//   - a lowercase letter in the first table is emitted as-is;
//   - an uppercase letter in the first table marks a two-letter replacement:
//     its lowercase form is emitted, followed by the character found at the
//     same position in the second ("_B") table.
// The strings are positional: every character must stay at its index.

// latin-1: code points 192..255, 64 entries
const char ASCII_DATA_192[] =
  "aaaaaaAceeeeiiiiDnooooo OuuuuyTSaaaaaaAceeeeiiiiDnooooo OuuuuyTy";
// second letters for the uppercase entries above (e.g. 'A' + 'e' -> "ae")
const char ASCII_DATA_192_B[] =
  "      e         h       e     hs      e         h       e     h ";

// latin extended A: code points 256..383, 128 entries (two 64-character rows)
const char ASCII_DATA_U0100[] =
  "aaaaaaccCCccCCddDDeeeeeeeeYYgggggggghhhhiiiiiiiiiiIIjjkkklllllll"
  "lllnnnnnnnNNooooooOOrrrrrrssSSSSSSTTttttuuuuuuOOuuuuwwyyyZZzzZZs";
// second letters for the uppercase entries above (e.g. 'C' + 'h' -> "ch")
const char ASCII_DATA_U0100_B[] =
  "        hh  hh  jj        ee                      jj            "
  "          gg      ee        hhhhhhss          uu         hh  hh ";


// Growable byte buffer used to accumulate the bytes of one word.
// Despite the name it is not a linked list: the `next` link is commented out.
typedef struct string_linkedlist_s {
  char * s;        // heap-allocated buffer; only NUL-terminated right before printing
  size_t len;      // number of bytes of s currently in use
  size_t bufsize;  // allocated size of s, in bytes
  //struct string_linkedlist_s *next;
} StringLL;

// input selection and main loop
int main_file(const char *filename);
int main_stdin(void);
int tok_words(FILE *fh);

// UTF-8 decoding
int readCharcode(FILE *fh);
int readWagonChar(FILE *fh);

// ASCII transliteration through a pair of lookup tables
int normCharcode(char *s, size_t pos, size_t offset, const char *table1, const char *table2);

// word buffer management; (void) makes StringLL_new a real prototype
// instead of an old-style declaration with unchecked arguments
StringLL* StringLL_new(void);
StringLL* StringLL_grow(StringLL *l);
StringLL* StringLL_appendchar(StringLL *l, int c);
StringLL* StringLL_next_print(StringLL *l);
void StringLL_destroy(StringLL *l);

/**
 * Program entry point: parse the command line and run the tokenizer.
 *
 * Arguments are read left to right. -s and -n only set their switch.
 * The first filename, "-" or empty argument starts processing right away,
 * so anything after it is ignored. When no input argument is given,
 * standard input is read.
 *
 * @param argc number of entries in argv
 * @param argv command-line arguments; argv[0] is echoed in the help text
 * @return 0 on success, 1 on an unknown option
 */
int main(int argc, char *argv[]) {
  int curarg;

  // set locale, otherwise towlower() does not work;
  // fall back to C.UTF-8 on systems without the en_US locale
  if (!setlocale(LC_ALL, "en_US.UTF-8") && !setlocale(LC_ALL, "C.UTF-8")) {
    fprintf(stderr, "Warning: no UTF-8 locale available, non-ASCII letters may be skipped\n");
  }

  for (curarg = 1; curarg < argc; curarg++) {
    const char *arg = argv[curarg];

    if (!strcmp(arg, "--help")) {
      printf(HELP, argv[0], argv[0]);
      return 0;
    } else if (!strcmp(arg, "--version")) {
      puts(VERSION);
      return 0;
    } else if (!strcmp(arg, "-s")) {
      cASE_SENSITIVE = 1;
    } else if (!strcmp(arg, "-n")) {
      nO_NORMALIZE = 1;
    } else if (!strcmp(arg, "-") || arg[0] == '\0') {
      return main_stdin();
    } else if (arg[0] != '-') {
      // anything not starting with a dash is the input filename
      return main_file(arg);
    } else {
      fprintf(stderr, "Unknown option: \"%s\"\n", arg);
      return 1;
    }
  }

  // no input argument (possibly only switches): default to stdin
  return main_stdin();
}

/**
 * Tokenize the words of a named file.
 *
 * Prints an error and terminates the program with status 1 when the file
 * cannot be opened.
 *
 * @param filename path of the file to read
 * @return the status returned by tok_words()
 */
int main_file(const char* filename){
  FILE *fh;
  int status;

  fh = fopen(filename, "r");
  if (!fh) {
    fprintf(stderr, "[Errno %d] Could not open file \"%s\"\n", errno, filename);
    exit(1);
  }

  status = tok_words(fh);

  // the handle was opened here, so it is released here
  fclose(fh);
  return status;
}

/**
 * Tokenize the words arriving on standard input.
 *
 * @return the status returned by tok_words()
 */
int main_stdin(void){
  FILE *input = stdin;
  int status = tok_words(input);

  return status;
}

/**
 * Main activity: split the input into words and print them, one per line.
 *
 * A word is a maximal run of characters for which iswalpha() is true.
 * Letters are lowercased with towlower() unless cASE_SENSITIVE is set.
 *
 * @param fh a read-only file handle
 * @return always 0
 */
int tok_words(FILE *fh){
  StringLL *word = StringLL_new();
  int in_word = 0;

  for (;;) {
    int code = readCharcode(fh);

    // a negative code means end of input
    if (code < 0) {
      break;
    }

    if (!iswalpha(code)) {
      // a non-letter ends the current word, if one was being collected
      if (in_word) {
        in_word = 0;
        word = StringLL_next_print(word);
      }
      continue;
    }

    in_word = 1;

    // locale lower case
    if (!cASE_SENSITIVE) {
      code = towlower(code);
    }

    word = StringLL_appendchar(word, code);
  }

  // input ended in the middle of a word: flush it
  if (in_word) {
    word = StringLL_next_print(word);
  }

  StringLL_destroy(word);
  return 0;
}

/**
 * Read one UTF-8 continuation ("wagon") byte, i.e. a byte in 0x80..0xBF.
 *
 * If the next byte is not a continuation byte it is pushed back onto the
 * stream, so that it is decoded as the start of the next character instead
 * of being silently dropped.
 *
 * @param fh stream to read from
 * @return the 6 payload bits (0..63), or -1 on EOF or on a byte that is
 *         not a continuation byte
 */
int readWagonChar(FILE * fh){
  int c = fgetc(fh);

  if (c == EOF) return -1;
  if ((c & 0xC0) != 0x80) {
    ungetc(c, fh);
    return -1;
  }
  return c & 0x3F;
}

/**
 * Read an UTF-8 character, and return its code.
 *
 * Sequences of 1 to 4 bytes are decoded.
 *
 * @param fh stream to read from
 * @return the code point (non-negative) on success,
 *         0 on a malformed or truncated sequence,
 *         -1 on EOF.
 */
int readCharcode(FILE* fh){
  int code, extra;
  int c = fgetc(fh);

  if (c < 0) return -1;
  if (c < 0x80) return c;

  // the lead byte tells how many continuation bytes follow
  if (c >= 0xC0 && c < 0xE0) {
    code = c & 0x1F;
    extra = 1;
  } else if (c >= 0xE0 && c < 0xF0) {
    code = c & 0x0F;
    extra = 2;
  } else if (c >= 0xF0 && c < 0xF8) {
    code = c & 0x07;
    extra = 3;
  } else {
    // stray continuation byte or invalid lead byte
    return 0;
  }

  while (extra-- > 0) {
    int wagon = readWagonChar(fh);
    if (wagon < 0) return 0;
    code = code * 64 + wagon;
  }

  // 4-byte sequences can encode values beyond the Unicode range
  if (code > 0x10FFFF) return 0;
  return code;
}

/**
 * Write the ASCII replacement for one table entry into a buffer.
 *
 * The entry table1[offset] selects what is written at s[pos]:
 *   - a space: nothing is written;
 *   - an uppercase letter: its lowercase form, then table2[offset];
 *   - anything else: the entry itself.
 *
 * @param s      destination buffer; needs room for 2 bytes starting at pos
 * @param pos    index in s where writing starts
 * @param offset index of the entry in both tables
 * @param table1 primary replacement table
 * @param table2 second-letter table, read only for uppercase entries
 * @return number of bytes written (0, 1 or 2)
 */
int normCharcode(char * s, size_t pos, size_t offset, const char *table1, const char *table2){
  const char primary = table1[offset];
  char *out;

  if (primary == ' ') {
    return 0;
  }

  out = s + pos;

  if (!isupper(primary)) {
    out[0] = primary;
    return 1;
  }

  // uppercase marks a two-letter replacement
  out[0] = tolower(primary);
  out[1] = table2[offset];
  return 2;
}

/***** StringLL functions *******/


/**
 * Allocate an empty word buffer with an initial capacity of 16 bytes.
 *
 * Callers never check the result for NULL, so running out of memory
 * terminates the program with status 1 instead of returning NULL.
 *
 * @return a new buffer; release it with StringLL_destroy()
 */
StringLL* StringLL_new (void) {
  StringLL *l = malloc(sizeof *l);

  if (!l) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }

  l->bufsize = 16;
  l->len = 0;
  l->s = malloc(l->bufsize);
  if (!l->s) {
    free(l);
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }

  return l;
}

/**
 * Double the capacity of a word buffer, keeping its contents.
 *
 * Terminates the program with status 1 when the memory cannot be obtained;
 * the buffer is never left pointing at a failed reallocation.
 *
 * @param l buffer to enlarge
 * @return l
 */
StringLL* StringLL_grow (StringLL* l){
  size_t newsize = l->bufsize ? l->bufsize * 2 : 16;
  char *grown;

  // newsize < bufsize means the doubling wrapped around
  grown = (newsize < l->bufsize) ? NULL : realloc(l->s, newsize);
  if (!grown) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }

  l->s = grown;
  l->bufsize = newsize;
  return l;
}

/**
 * Append one character, given as a code point, to a word buffer.
 *
 * ASCII is stored as-is. Unless nO_NORMALIZE is set, code points 192..255
 * and 256..383 are replaced by their ASCII transliteration from the lookup
 * tables. Everything else is stored as UTF-8 (2 to 4 bytes).
 * Code points that are zero, negative or above 0x10FFFF are ignored.
 *
 * @param l buffer to append to
 * @param c code point to append
 * @return the buffer to keep using (the value returned by StringLL_grow()
 *         when the buffer had to be enlarged)
 */
StringLL* StringLL_appendchar(StringLL* l, int c){
  if (c <= 0 || c > 0x10FFFF) {
    return l;
  }

  // keep room for the longest encoding (4 bytes) plus the terminating NUL
  // that StringLL_next_print() writes
  if (l->bufsize - l->len <= 4){
    l = StringLL_grow(l);
  }

  if (c < 0x80){
    // ascii
    l->s[l->len++] = (char) c;
  } else if (!nO_NORMALIZE && 192 <= c && c < 256) {
    // latin-1 supplement
    l->len += normCharcode(l->s, l->len, c - 192, ASCII_DATA_192, ASCII_DATA_192_B);
  } else if (!nO_NORMALIZE && 256 <= c && c < 384) {
    // latin extended-A
    l->len += normCharcode(l->s, l->len, c - 256, ASCII_DATA_U0100, ASCII_DATA_U0100_B);
  } else if (c < 0x800) {
    // 2 byte UTF-8
    l->s[l->len++] = (char) (0xC0 | (c >> 6));
    l->s[l->len++] = (char) (0x80 | (c & 0x3F));
  } else if (c < 0x10000) {
    // 3 byte UTF-8
    l->s[l->len++] = (char) (0xE0 | (c >> 12));
    l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
    l->s[l->len++] = (char) (0x80 | (c & 0x3F));
  } else {
    // 4-byte UTF-8; the last byte carries the low 6 bits of the code point
    l->s[l->len++] = (char) (0xF0 | (c >> 18));
    l->s[l->len++] = (char) (0x80 | ((c >> 12) & 0x3F));
    l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
    l->s[l->len++] = (char) (0x80 | (c & 0x3F));
  }
  return l;
}

/**
 * Print the buffered word followed by a newline and hand back an empty
 * buffer for the next word.
 *
 * The same buffer is emptied and reused, which avoids one allocation and
 * one release per printed word.
 *
 * @param l buffer holding the word to print
 * @return an empty buffer to collect the next word into; the caller must
 *         use this value instead of the pointer it passed in
 */
StringLL* StringLL_next_print (StringLL *l){
  // StringLL_appendchar() always leaves room for this terminator
  l->s[l->len] = '\0';
  puts(l->s);

  l->len = 0;
  return l;
}

/**
 * Release a word buffer and the bytes it holds.
 *
 * @param l buffer to release; NULL is accepted and ignored, like free()
 */
void StringLL_destroy (StringLL *l){
  if (!l) {
    return;
  }

  free(l->s);
  free(l);
}