xefyl/wordenizer.c

292 lines
6.5 KiB
C
Raw Permalink Normal View History

2025-10-08 14:46:09 +02:00
/* Tokenize words, letter by letter.
* Supports Latin characters.
* Compile with gcc
*
* (c) 2021 Sakuragasaki46
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <locale.h>
#include <wctype.h>
#include <ctype.h>
// ---- user interface ----
// printf() template shown by --help; both %s placeholders receive argv[0].
const char HELP[] =
    "%s - words in a text file\n"
    "Prints a list of words in lowercase form, one per line.\n"
    "Supports UTF-8 strings.\n"
    "\n"
    "Usage: %s [filename]\n"
    "\n"
    "Arguments:\n"
    "\tfilename\t\tfilename to analyze (defaults to stdin)\n"
    "\t-s\t\tcase sensitive\n"
    "\t-n\t\tdont normalize non-ASCII characters\n"
    "\t--help\t\tshows this help message and exits\n"
    "\t--version\t\tshow version and exit\n";
// Text printed by --version.
const char VERSION[] = "0.2";

// ---- behavior switches ----
// Set to 1 by -s: tok_words() then skips the towlower() step.
int cASE_SENSITIVE = 0;
// Set to 1 by -n: StringLL_appendchar() then skips the ASCII
// transliteration tables and re-encodes the character as UTF-8.
int nO_NORMALIZE = 0;
// Transliteration tables for normCharcode().
//
// Each pair of tables is indexed by (code point - first code point of the
// block). The first table gives the ASCII replacement letter:
//   ' '        the character is dropped,
//   lowercase  the character is replaced by that single letter,
//   UPPERCASE  the character expands to two letters: the lowercased entry
//              followed by the entry at the SAME index in the "_B" table.
// Both tables of a pair must therefore have exactly the same length; the
// "_B" table holds ' ' wherever the first table is not uppercase.

// Latin-1 Supplement, U+00C0..U+00FF (index = code point - 192)
const char ASCII_DATA_192[] =
    "aaaaaaAceeeeiiii"   // U+00C0..U+00CF
    "Dnooooo OuuuuyTS"   // U+00D0..U+00DF
    "aaaaaaAceeeeiiii"   // U+00E0..U+00EF
    "Dnooooo OuuuuyTy";  // U+00F0..U+00FF
const char ASCII_DATA_192_B[] =
    "      e         "   // U+00C6 -> "ae"
    "h       e     hs"   // U+00D0 -> "dh", U+00D8 -> "oe", U+00DE -> "th", U+00DF -> "ss"
    "      e         "   // U+00E6 -> "ae"
    "h       e     h ";  // U+00F0 -> "dh", U+00F8 -> "oe", U+00FE -> "th"

// Latin Extended-A, U+0100..U+017F (index = code point - 256)
const char ASCII_DATA_U0100[] =
    "aaaaaaccCCccCCdd"   // U+0100..U+010F
    "DDeeeeeeeeYYgggg"   // U+0110..U+011F
    "gggghhhhiiiiiiii"   // U+0120..U+012F
    "iiIIjjkkklllllll"   // U+0130..U+013F
    "lllnnnnnnnNNoooo"   // U+0140..U+014F
    "ooOOrrrrrrssSSSS"   // U+0150..U+015F
    "SSTTttttuuuuuuOO"   // U+0160..U+016F
    "uuuuwwyyyZZzzZZs";  // U+0170..U+017F
const char ASCII_DATA_U0100_B[] =
    "        hh  hh  "   // "ch" x4
    "jj        ee    "   // "dj" x2, "ye" x2
    "                "
    "  jj            "   // "ij" x2
    "          gg    "   // "ng" x2
    "  ee        hhhh"   // "oe" x2, "sh" x4
    "hhss          uu"   // "sh" x2, "ts" x2, "ou" x2
    "         hh  hh ";  // "zh" x2, "zh" x2
// Growable byte buffer that accumulates the word currently being read.
typedef struct string_linkedlist_s {
    char *s;         // heap buffer holding the bytes of the word
    size_t len;      // number of bytes currently stored in s
    size_t bufsize;  // allocated size of s, in bytes
} StringLL;
// Forward declarations. Every declaration is a full prototype (note the
// explicit "void" for parameterless functions) so that the compiler can
// check the arguments of every call.

// entry points
int main_file(const char *filename);
int main_stdin(void);
int tok_words(FILE *fh);

// UTF-8 decoding and ASCII transliteration
int readCharcode(FILE *fh);
int readWagonChar(FILE *fh);
int normCharcode(char *s, size_t pos, size_t offset, const char *table1, const char *table2);

// word buffer management
StringLL *StringLL_new(void);
StringLL *StringLL_grow(StringLL *l);
StringLL *StringLL_appendchar(StringLL *l, int c);
StringLL *StringLL_next_print(StringLL *l);
void StringLL_destroy(StringLL *l);
/**
 * Program entry point: parse the command line and run the tokenizer.
 *
 * Arguments are processed left to right. "-s" and "-n" set the behavior
 * switches and must therefore precede the filename; the first filename
 * (or "-" / an empty argument, meaning stdin) starts processing
 * immediately and ends argument parsing. When no filename is given at
 * all, standard input is read.
 *
 * @return 0 on success, 1 on an unknown option or an unreadable file
 */
int main(int argc, char *argv[]) {
    // set locale, otherwise towlower() does not work
    setlocale(LC_ALL, "en_US.UTF-8");

    for (int curarg = 1; curarg < argc; curarg++) {
        const char *arg = argv[curarg];

        if (!strcmp(arg, "--help")) {
            printf(HELP, argv[0], argv[0]);
            return 0;
        } else if (!strcmp(arg, "--version")) {
            puts(VERSION);
            return 0;
        } else if (!strcmp(arg, "-s")) {
            cASE_SENSITIVE = 1;
        } else if (!strcmp(arg, "-n")) {
            nO_NORMALIZE = 1;
        } else if (!strcmp(arg, "-") || arg[0] == '\0') {
            return main_stdin();
        } else if (arg[0] != '-') {
            return main_file(arg);
        } else {
            fprintf(stderr, "Unknown option: \"%s\"\n", arg);
            return 1;
        }
    }

    // No filename on the command line (possibly only switches): the
    // input defaults to stdin, as the help text promises.
    return main_stdin();
}
/**
 * Tokenize the file named filename.
 *
 * @param filename path of the file to read
 * @return the result of tok_words(), or 1 if the file cannot be opened
 *         (an error message is printed to stderr in that case)
 */
int main_file(const char *filename){
    FILE *fh = fopen(filename, "r");
    int rc;

    if (!fh) {
        fprintf(stderr, "[Errno %d] Could not open file \"%s\"\n", errno, filename);
        return 1;
    }
    rc = tok_words(fh);
    // release the handle opened above; the stream was only read from
    fclose(fh);
    return rc;
}
/**
 * Tokenize standard input.
 *
 * @return the result of tok_words()
 */
int main_stdin(void){
    FILE *input = stdin;

    return tok_words(input);
}
/**
 * Main activity: split a stream into words and print one word per line.
 *
 * A word is a maximal run of characters for which iswalpha() is true.
 * Unless cASE_SENSITIVE is set, each letter is lowercased with towlower()
 * before it is stored.
 *
 * @param fh a read-only file handle
 * @return always 0
 */
int tok_words(FILE *fh){
    StringLL *word = StringLL_new();
    int letters = 0;  // letters collected for the word in progress
    int code;

    for (;;) {
        code = readCharcode(fh);
        if (code < 0) {
            break;  // end of input
        }

        if (!iswalpha(code)) {
            // a separator ends the word in progress, if there is one
            if (letters > 0) {
                letters = 0;
                word = StringLL_next_print(word);
            }
            continue;
        }

        ++letters;
        if (!cASE_SENSITIVE) {
            // locale lower case
            code = towlower(code);
        }
        word = StringLL_appendchar(word, code);
    }

    // input ended in the middle of a word: print what was collected
    if (letters) {
        word = StringLL_next_print(word);
    }
    StringLL_destroy(word);
    return 0;
}
/**
 * Read one UTF-8 continuation ("wagon") byte, i.e. a byte in 0x80..0xBF.
 *
 * If the next byte is not a continuation byte it is pushed back onto the
 * stream, so that a truncated sequence does not swallow the character
 * that follows it.
 *
 * @param fh a read-only file handle
 * @return the 6 payload bits (0..63), or -1 on EOF or on a byte that is
 *         not a continuation byte
 */
int readWagonChar(FILE *fh){
    int c = fgetc(fh);

    if (c < 128 || c >= 192) {
        if (c != EOF) {
            ungetc(c, fh);
        }
        return -1;
    }
    return c - 128;
}
/**
 * Read an UTF-8 character, and return its code.
 *
 * Sequences of 1 to 4 bytes are decoded. Malformed input (a stray
 * continuation byte, an invalid lead byte, a truncated sequence or a
 * value above U+10FFFF) yields 0.
 *
 * @param fh a read-only file handle
 * @return a non-negative value on success (0 for malformed input),
 *         -1 on EOF
 */
int readCharcode(FILE *fh){
    int c = fgetc(fh);
    int wagons;  // continuation bytes announced by the lead byte

    if (c < 0) return -1;
    if (c < 128) return c;

    if (c < 192) {
        return 0;            // continuation byte without a lead byte
    } else if (c < 224) {
        c -= 192;            // 110xxxxx
        wagons = 1;
    } else if (c < 240) {
        c -= 224;            // 1110xxxx
        wagons = 2;
    } else if (c < 248) {
        c -= 240;            // 11110xxx
        wagons = 3;
    } else {
        return 0;            // 0xF8..0xFF never start a sequence
    }

    while (wagons-- > 0) {
        int c2 = readWagonChar(fh);
        if (c2 < 0) return 0;
        c = c * 64 + c2;
    }

    // outside the Unicode range
    if (c > 0x10FFFF) return 0;
    return c;
}
/**
 * Write the ASCII transliteration of one character into a buffer.
 *
 * The entry table1[offset] selects the outcome:
 *   ' '        nothing is written,
 *   uppercase  two bytes are written: the lowercased entry, then
 *              table2[offset],
 *   otherwise  the entry itself is written.
 *
 * @param s      destination buffer
 * @param pos    index in s of the first byte to write
 * @param offset index of the character in both tables
 * @param table1 primary replacement table
 * @param table2 second-letter table, read only for uppercase entries
 * @return the number of bytes written (0, 1 or 2)
 */
int normCharcode(char * s, size_t pos, size_t offset, const char *table1, const char *table2){
    const char entry = table1[offset];
    char *out = s + pos;

    if (entry == ' ') {
        return 0;
    }
    if (!isupper(entry)) {
        out[0] = entry;
        return 1;
    }
    out[0] = tolower(entry);
    out[1] = table2[offset];
    return 2;
}
/***** StringLL functions *******/
/**
 * Allocate an empty StringLL with a 16-byte buffer.
 *
 * Running out of memory is treated as fatal: a message is printed to
 * stderr and the program exits with status 1, so the result is never NULL.
 *
 * @return a new StringLL; release it with StringLL_destroy()
 */
StringLL* StringLL_new (void) {
    StringLL *l = malloc(sizeof *l);

    if (!l) {
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }
    l->bufsize = 16;
    l->len = 0;
    l->s = malloc(l->bufsize);
    if (!l->s) {
        free(l);
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }
    l->s[0] = '\0';
    return l;
}
/**
 * Double the capacity of l's buffer, keeping its contents.
 *
 * Running out of memory (or overflowing size_t) is treated as fatal: a
 * message is printed to stderr and the program exits with status 1.
 *
 * @param l the StringLL to enlarge
 * @return l
 */
StringLL* StringLL_grow (StringLL* l){
    size_t newsize = l->bufsize ? l->bufsize * 2 : 16;
    char *grown = NULL;

    // newsize <= bufsize means the multiplication wrapped around
    if (newsize > l->bufsize) {
        // keep l->s intact until realloc() is known to have succeeded
        grown = realloc(l->s, newsize);
    }
    if (!grown) {
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }
    l->s = grown;
    l->bufsize = newsize;
    return l;
}
/**
 * Append one character, given by its code point, to l.
 *
 * Code 0 is ignored. ASCII is stored as is. Unless nO_NORMALIZE is set,
 * U+00C0..U+00FF and U+0100..U+017F are replaced by their ASCII
 * transliteration (0 to 2 bytes, see normCharcode()). Every other
 * character is stored as its UTF-8 encoding (2 to 4 bytes).
 *
 * @param l the word buffer
 * @param c code point to append
 * @return l (grown first if fewer than 5 bytes were free)
 */
StringLL* StringLL_appendchar(StringLL* l, int c){
    if (c == 0) {
        return l;
    }
    // Keep room for the longest encoding (4 bytes) plus the terminator
    // that StringLL_next_print() writes at s[len].
    if (l->bufsize - l->len <= 4){
        l = StringLL_grow(l);
    }
    if (c < 128){
        // ascii
        l->s[l->len++] = (char) c;
    } else if (!nO_NORMALIZE && 192 <= c && c < 256) {
        // latin-1 supplement
        l->len += normCharcode(l->s, l->len, c - 192, ASCII_DATA_192, ASCII_DATA_192_B);
    } else if (!nO_NORMALIZE && 256 <= c && c < 384) {
        // latin extended-A
        l->len += normCharcode(l->s, l->len, c - 256, ASCII_DATA_U0100, ASCII_DATA_U0100_B);
    } else if (c < 0x800) {
        // 2 byte UTF-8: 110xxxxx 10xxxxxx
        l->s[l->len++] = (char) (0xC0 | (c >> 6));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    } else if (c < 0x10000) {
        // 3 byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx
        l->s[l->len++] = (char) (0xE0 | (c >> 12));
        l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    } else {
        // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        l->s[l->len++] = (char) (0xF0 | (c >> 18));
        l->s[l->len++] = (char) (0x80 | ((c >> 12) & 0x3F));
        l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
        // the last byte carries the LOW six bits of the code point
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    }
    return l;
}
/**
 * Print the word stored in l on its own line and make l ready to
 * collect the next word.
 *
 * The buffer is emptied and reused rather than freed and reallocated,
 * so this function performs no allocation and cannot fail on memory.
 *
 * @param l the word buffer; must have at least one free byte after its
 *          contents, which is used for the terminator
 * @return the buffer to use for the next word (l itself, now empty)
 */
StringLL* StringLL_next_print (StringLL *l){
    l->s[l->len] = '\0';
    printf("%s\n", l->s);
    l->len = 0;
    return l;
}
/**
 * Release a StringLL and its buffer.
 *
 * @param l the StringLL to free; NULL is accepted and ignored, like free()
 */
void StringLL_destroy (StringLL *l){
    if (!l) {
        return;
    }
    free(l->s);
    free(l);
}