/*
 * Tokenize words, letter by letter.
 * Supports Latin characters.
 * Compile with gcc
 *
 * (c) 2021 Sakuragasaki46
 */

#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>

/* user interface */

/* printf() format for --help; both %s are filled with the program name. */
const char HELP[] =
    "%s - words in a text file\n"
    "Prints a list of words in lowercase form, one per line.\n"
    "Supports UTF-8 strings.\n"
    "\n"
    "Usage: %s [filename]\n"
    "\n"
    "Arguments:\n"
    "\tfilename\t\tfilename to analyze (defaults to stdin)\n"
    "\t-s\t\tcase sensitive\n"
    "\t-n\t\tdon’t normalize non-ASCII characters\n"
    "\t--help\t\tshows this help message and exits\n"
    "\t--version\t\tshow version and exit\n";

const char VERSION[] = "0.2";

/* behavior switches */
int cASE_SENSITIVE = 0; /* set by -s: keep the original letter case */
int nO_NORMALIZE = 0;   /* set by -n: emit non-ASCII letters unchanged */

/*
 * Table data for normCharcode().
 *
 * Each primary table holds one entry per code point of its range:
 *   - a lowercase letter: the code point is replaced by that letter;
 *   - an uppercase letter: the code point is replaced by two letters, the
 *     lowercased entry followed by the character at the SAME index of the
 *     matching "_B" table;
 *   - a space: the code point has no replacement.
 *
 * The "_B" tables are only read at indexes where the primary table is
 * uppercase, but they must stay exactly as long as their primary table.
 * The strings are split into 16-character pieces to keep columns aligned.
 */

/* latin-1 supplement, U+00C0..U+00FF */
const char ASCII_DATA_192[] =
    "aaaaaaAceeeeiiii"
    "Dnooooo OuuuuyTS"
    "aaaaaaAceeeeiiii"
    "Dnooooo OuuuuyTy";
const char ASCII_DATA_192_B[] =
    "      e         "
    "h       e     hs"
    "      e         "
    "h       e     h ";

/* latin extended-A, U+0100..U+017F */
const char ASCII_DATA_U0100[] =
    "aaaaaaccCCccCCdd"
    "DDeeeeeeeeYYgggg"
    "gggghhhhiiiiiiii"
    "iiIIjjkkklllllll"
    "lllnnnnnnnNNoooo"
    "ooOOrrrrrrssSSSS"
    "SSTTttttuuuuuuOO"
    "uuuuwwyyyZZzzZZs";
const char ASCII_DATA_U0100_B[] =
    "        hh  hh  "
    "jj        ee    "
    "                "
    "  jj            "
    "          gg    "
    "  ee        hhhh"
    "hhss          uu"
    "         hh  hh ";

/* Guard against whitespace damage: table lengths are part of the logic. */
_Static_assert(sizeof ASCII_DATA_192 == 64 + 1, "latin-1 table must have 64 entries");
_Static_assert(sizeof ASCII_DATA_192_B == 64 + 1, "latin-1 B table must have 64 entries");
_Static_assert(sizeof ASCII_DATA_U0100 == 128 + 1, "extended-A table must have 128 entries");
_Static_assert(sizeof ASCII_DATA_U0100_B == 128 + 1, "extended-A B table must have 128 entries");

/*
 * Growable byte buffer holding the word currently being collected.
 * (Despite the name it is not linked to any other node.)
 */
typedef struct string_linkedlist_s {
    char *s;        /* heap buffer of bufsize bytes, owned by this struct */
    size_t len;     /* number of bytes currently used in s */
    size_t bufsize; /* allocated size of s */
} StringLL;

int main_file(const char *filename);
int main_stdin(void);
int tok_words(FILE *fr);
int readCharcode(FILE *fh);
int readWagonChar(FILE *fh);
int normCharcode(char *s, size_t pos, size_t offset, const char *table1, const char *table2);

StringLL *StringLL_new(void);
StringLL *StringLL_grow(StringLL *);
StringLL *StringLL_appendchar(StringLL *, int);
StringLL *StringLL_next_print(StringLL *);
void StringLL_destroy(StringLL *);

/**
 * Program entry point: parses the command line and tokenizes the input.
 *
 * Options are handled left to right; the first argument that is not an
 * option is taken as the file name and ends option parsing. When no file
 * name is given, standard input is read.
 *
 * @return 0 on success, 1 on an unknown option or an unreadable file
 */
int main(int argc, char *argv[])
{
    /* set locale, otherwise towlower() does not work */
    if (!setlocale(LC_ALL, "en_US.UTF-8")) {
        /* en_US may not be installed; any UTF-8 locale will do */
        setlocale(LC_ALL, "C.UTF-8");
    }

    for (int curarg = 1; curarg < argc; curarg++) {
        const char *arg = argv[curarg];

        if (!strcmp(arg, "--help")) {
            printf(HELP, argv[0], argv[0]);
            return 0;
        } else if (!strcmp(arg, "--version")) {
            puts(VERSION);
            return 0;
        } else if (!strcmp(arg, "-s")) {
            cASE_SENSITIVE = 1;
        } else if (!strcmp(arg, "-n")) {
            nO_NORMALIZE = 1;
        } else if (!strcmp(arg, "-") || arg[0] == '\0') {
            return main_stdin();
        } else if (arg[0] != '-') {
            return main_file(arg);
        } else {
            fprintf(stderr, "Unknown option: \"%s\"\n", arg);
            return 1;
        }
    }

    /* only options (or nothing) were given: filename defaults to stdin */
    return main_stdin();
}

/**
 * Tokenizes the file with the given name.
 *
 * @param filename path of the file to read
 * @return the result of tok_words(), or 1 if the file cannot be opened
 */
int main_file(const char *filename)
{
    FILE *fh = fopen(filename, "r");
    int status;

    if (!fh) {
        fprintf(stderr, "[Errno %d] Could not open file \"%s\"\n", errno, filename);
        return 1;
    }
    status = tok_words(fh);
    fclose(fh);
    return status;
}

/**
 * Tokenizes standard input.
 *
 * @return the result of tok_words()
 */
int main_stdin(void)
{
    return tok_words(stdin);
}

/**
 * Main activity: reads code points from the stream, collects consecutive
 * alphabetic ones into a word and prints each finished word on its own line.
 *
 * @param fh a read-only file handle
 * @return always 0
 */
int tok_words(FILE *fh)
{
    int charcode;
    int intoword = 0; /* non-zero while a word is being collected */
    StringLL *lend = StringLL_new();

    while ((charcode = readCharcode(fh)) >= 0) {
        if (iswalpha((wint_t) charcode)) {
            intoword = 1;
            /* locale lower case */
            if (!cASE_SENSITIVE) {
                charcode = (int) towlower((wint_t) charcode);
            }
            lend = StringLL_appendchar(lend, charcode);
        } else if (intoword) {
            intoword = 0;
            lend = StringLL_next_print(lend);
        }
    }
    /* input ended in the middle of a word */
    if (intoword) {
        lend = StringLL_next_print(lend);
    }
    StringLL_destroy(lend);
    return 0;
}

/**
 * Read an UTF-8 character, and return its code.
 *
 * Sequences of one to four bytes are decoded. A malformed sequence (stray
 * continuation byte, invalid lead byte, missing continuation byte, or a
 * value above U+10FFFF) yields 0, which callers treat as a non-letter.
 *
 * @param fh stream to read from
 * @return a non-negative value on success, -1 on EOF
 */
int readCharcode(FILE *fh)
{
    int lead = fgetc(fh);
    int code;
    int pending; /* continuation bytes still expected */

    if (lead == EOF) {
        return -1;
    }
    if (lead < 0x80) {
        return lead;
    }

    if (0xC0 <= lead && lead < 0xE0) {
        code = lead - 0xC0;
        pending = 1;
    } else if (0xE0 <= lead && lead < 0xF0) {
        code = lead - 0xE0;
        pending = 2;
    } else if (0xF0 <= lead && lead < 0xF8) {
        code = lead - 0xF0;
        pending = 3;
    } else {
        return 0;
    }

    while (pending-- > 0) {
        int payload = readWagonChar(fh);
        if (payload < 0) {
            return 0;
        }
        code = code * 64 + payload;
    }
    return code <= 0x10FFFF ? code : 0;
}

/**
 * Read one UTF-8 continuation byte (0x80..0xBF).
 *
 * A byte that is not a continuation byte is pushed back onto the stream so
 * the next read can decode it as the start of a new character.
 *
 * @param fh stream to read from
 * @return the low 6 payload bits, or -1 on EOF or a non-continuation byte
 */
int readWagonChar(FILE *fh)
{
    int c = fgetc(fh);

    if (c < 0x80 || c >= 0xC0) {
        if (c != EOF) {
            ungetc(c, fh);
        }
        return -1;
    }
    return c - 0x80;
}

/**
 * Write the ASCII replacement of one table entry into a buffer.
 *
 * @param s      destination buffer; needs room for 2 bytes starting at pos
 * @param pos    index in s where writing starts
 * @param offset index of the entry in both tables
 * @param table1 primary table (see the table description above)
 * @param table2 second-letter table, read only for uppercase entries
 * @return number of bytes written: 0, 1 or 2
 */
int normCharcode(char *s, size_t pos, size_t offset, const char *table1, const char *table2)
{
    unsigned char entry = (unsigned char) table1[offset];

    if (entry == ' ') {
        return 0;
    }
    if (isupper(entry)) {
        /* uppercase marks a two-letter replacement */
        s[pos] = (char) tolower(entry);
        s[pos + 1] = table2[offset];
        return 2;
    }
    s[pos] = (char) entry;
    return 1;
}

/***** StringLL functions *******/

/* Reports an allocation failure and terminates the program. */
static void StringLL_out_of_memory(void)
{
    fputs("Out of memory\n", stderr);
    exit(1);
}

/**
 * Allocate an empty buffer with an initial capacity of 16 bytes.
 * The program exits if memory cannot be allocated.
 *
 * @return a new buffer; release it with StringLL_destroy()
 */
StringLL *StringLL_new(void)
{
    StringLL *l = malloc(sizeof *l);

    if (!l) {
        StringLL_out_of_memory();
    }
    l->bufsize = 16;
    l->len = 0;
    l->s = malloc(l->bufsize);
    if (!l->s) {
        StringLL_out_of_memory();
    }
    return l;
}

/**
 * Double the capacity of a buffer, keeping its contents.
 * The program exits if memory cannot be allocated.
 *
 * @return the same buffer l
 */
StringLL *StringLL_grow(StringLL *l)
{
    size_t wanted = l->bufsize * 2;
    char *bigger;

    if (wanted <= l->bufsize) {
        /* size_t wrapped around */
        StringLL_out_of_memory();
    }
    bigger = realloc(l->s, wanted);
    if (!bigger) {
        StringLL_out_of_memory();
    }
    l->s = bigger;
    l->bufsize = wanted;
    return l;
}

/**
 * Append one code point to the buffer.
 *
 * ASCII is stored as is. Unless nO_NORMALIZE is set, Latin-1 Supplement
 * letters (U+00C0..U+00FF) and Latin Extended-A (U+0100..U+017F) are
 * replaced through the normalization tables. Everything else is stored
 * as UTF-8. Code points <= 0 are ignored.
 *
 * @param l buffer to append to
 * @param c code point
 * @return the same buffer l
 */
StringLL *StringLL_appendchar(StringLL *l, int c)
{
    if (c <= 0) {
        return l;
    }
    /* keep room for the longest encoding (4 bytes) plus the final NUL */
    if (l->bufsize - l->len <= 4) {
        l = StringLL_grow(l);
    }

    if (c < 0x80) {
        /* ascii */
        l->s[l->len++] = (char) c;
    } else if (!nO_NORMALIZE && 192 <= c && c < 256) {
        /* latin-1 supplement */
        l->len += (size_t) normCharcode(l->s, l->len, (size_t) (c - 192),
                                        ASCII_DATA_192, ASCII_DATA_192_B);
    } else if (!nO_NORMALIZE && 256 <= c && c < 384) {
        /* latin extended-A */
        l->len += (size_t) normCharcode(l->s, l->len, (size_t) (c - 256),
                                        ASCII_DATA_U0100, ASCII_DATA_U0100_B);
    } else if (c < 0x800) {
        /* 2 byte UTF-8 */
        l->s[l->len++] = (char) (0xC0 | (c >> 6));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    } else if (c < 0x10000) {
        /* 3 byte UTF-8 */
        l->s[l->len++] = (char) (0xE0 | (c >> 12));
        l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    } else {
        /* 4-byte UTF-8 */
        l->s[l->len++] = (char) (0xF0 | ((c >> 18) & 0x07));
        l->s[l->len++] = (char) (0x80 | ((c >> 12) & 0x3F));
        l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    }
    return l;
}

/**
 * Print the collected word followed by a newline and empty the buffer so
 * the next word can be collected in it.
 *
 * @param l buffer holding the finished word
 * @return the buffer to use for the next word (the same l, now empty)
 */
StringLL *StringLL_next_print(StringLL *l)
{
    /* StringLL_appendchar() always leaves at least one free byte */
    l->s[l->len] = '\0';
    printf("%s\n", l->s);
    l->len = 0;
    return l;
}

/**
 * Release a buffer and its storage. A NULL pointer is ignored.
 */
void StringLL_destroy(StringLL *l)
{
    if (!l) {
        return;
    }
    free(l->s);
    free(l);
}