xefyl/wordenizer.c

291 lines
6.5 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Tokenize words, letter by letter.
* Supports Latin characters.
* Compile with gcc
*
* (c) 2021 Sakuragasaki46
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <locale.h>
#include <wctype.h>
#include <ctype.h>
// user interface
// HELP is used as a printf() format string; main() passes the program
// name for both %s conversions.
const char HELP[] =
    "%s - words in a text file\n"
    "Prints a list of words in lowercase form, one per line.\n"
    "Supports UTF-8 strings.\n"
    "\n"
    "Usage: %s [filename]\n"
    "\n"
    "Arguments:\n"
    "\tfilename\t\tfilename to analyze (defaults to stdin)\n"
    "\t-s\t\tcase sensitive\n"
    "\t-n\t\tdont normalize non-ASCII characters\n"
    "\t--help\t\tshows this help message and exits\n"
    "\t--version\t\tshow version and exit\n";
// Printed verbatim by the --version option.
const char VERSION[] = "0.2";
// behavior switches
// Non-zero (set by -s): words keep their original letter case.
int cASE_SENSITIVE = 0;
// Non-zero (set by -n): Latin-1 / Latin Extended-A letters are not
// replaced by their ASCII approximation.
int nO_NORMALIZE = 0;
// table data for normCharcode()
// Each primary table holds one entry per code point of its range:
//   lowercase letter -> the code point is replaced by that letter
//   uppercase letter -> the code point is replaced by two letters: the
//                       lowercased entry followed by the byte found at the
//                       same position in the matching _B table
//   space            -> the code point is dropped
// A _B table must therefore be exactly as long as its primary table; the
// rows below are kept 16 entries wide so the alignment can be checked by eye.
// latin-1 (U+00C0..U+00FF)
const char ASCII_DATA_192[] =
    "aaaaaaAceeeeiiii"
    "Dnooooo OuuuuyTS"
    "aaaaaaAceeeeiiii"
    "Dnooooo OuuuuyTy";
const char ASCII_DATA_192_B[] =
    "      e         "
    "h       e     hs"
    "      e         "
    "h       e     h ";
// latin extended A (U+0100..U+017F)
const char ASCII_DATA_U0100[] =
    "aaaaaaccCCccCCdd"
    "DDeeeeeeeeYYgggg"
    "gggghhhhiiiiiiii"
    "iiIIjjkkklllllll"
    "lllnnnnnnnNNoooo"
    "ooOOrrrrrrssSSSS"
    "SSTTttttuuuuuuOO"
    "uuuuwwyyyZZzzZZs";
const char ASCII_DATA_U0100_B[] =
    "        hh  hh  "
    "jj        ee    "
    "                "
    "  jj            "
    "          gg    "
    "  ee        hhhh"
    "hhss          uu"
    "         hh  hh ";
/* Growable byte buffer that holds the word currently being assembled. */
typedef struct string_linkedlist_s {
    char  *s;        /* heap buffer containing the bytes collected so far */
    size_t len;      /* number of bytes of s currently in use */
    size_t bufsize;  /* allocated size of s, in bytes */
} StringLL;
/* Entry points selected by main(). */
int main_file(const char *filename);
int main_stdin(void);
/* Tokenizer and UTF-8 input helpers. */
int tok_words(FILE *fh);
int readCharcode(FILE *fh);
int readWagonChar(FILE *fh);
int normCharcode(char *s, size_t pos, size_t offset, const char *table1, const char *table2);
/* Word buffer management. */
StringLL *StringLL_new(void); /* (void): a real prototype, not "unspecified arguments" */
StringLL *StringLL_grow(StringLL *l);
StringLL *StringLL_appendchar(StringLL *l, int c);
StringLL *StringLL_next_print(StringLL *l);
void StringLL_destroy(StringLL *l);
/**
 * Program entry point: parses the command line, then tokenizes the input.
 *
 * Arguments are examined left to right:
 *   --help / --version  print the requested text and exit with status 0
 *   -s / -n             set cASE_SENSITIVE / nO_NORMALIZE and continue
 *   "-" or ""           tokenize stdin immediately
 *   anything not starting with '-' is taken as the file to tokenize
 *   any other "-..." argument is reported as unknown (exit status 1)
 * When no input argument is given (no arguments at all, or switches only),
 * stdin is tokenized.
 *
 * @return the status of the tokenizer run, or 1 for an unknown option
 */
int main(int argc, char *argv[]) {
    int curarg;
    // set locale, otherwise towlower() does not work
    setlocale(LC_ALL, "en_US.UTF-8");
    for (curarg = 1; curarg < argc; curarg++) {
        const char *arg = argv[curarg];
        if (!strcmp(arg, "--help")) {
            printf(HELP, argv[0], argv[0]);
            exit(0);
        } else if (!strcmp(arg, "--version")) {
            puts(VERSION);
            exit(0);
        } else if (!strcmp(arg, "-s")) {
            cASE_SENSITIVE = 1;
        } else if (!strcmp(arg, "-n")) {
            nO_NORMALIZE = 1;
        } else if (!strcmp(arg, "-") || arg[0] == '\0') {
            return main_stdin();
        } else if (arg[0] != '-') {
            return main_file(arg);
        } else {
            fprintf(stderr, "Unknown option: \"%s\"\n", arg);
            return 1;
        }
    }
    // Only switches (or nothing at all) were given: the input is stdin.
    return main_stdin();
}
/**
 * Tokenizes the words of a named file.
 *
 * If the file cannot be opened, a message with the errno value is written
 * to stderr and the process exits with status 1.
 *
 * @param filename path of the file to read
 * @return the value returned by tok_words()
 */
int main_file(const char* filename){
    FILE *fh = fopen(filename, "r");
    int status;
    if (!fh) {
        fprintf(stderr, "[Errno %d] Could not open file \"%s\"\n", errno, filename);
        exit(1);
    }
    status = tok_words(fh);
    // the stream is owned by this function: release it once tokenizing is done
    fclose(fh);
    return status;
}
/**
 * Tokenizes the words arriving on standard input.
 *
 * @return the value returned by tok_words()
 */
int main_stdin(void){
    FILE *input = stdin;
    return tok_words(input);
}
/**
 * Main activity: splits a stream into words and prints one word per line.
 *
 * Code points are read with readCharcode() until it reports end of input.
 * A run of consecutive alphabetic code points (as judged by iswalpha())
 * forms a word; every other code point ends the current word, which is
 * then printed. Letters are lowercased with towlower() unless
 * cASE_SENSITIVE is set. The stream itself is not closed here.
 *
 * @param fh a read-only file handle
 * @return always 0
 */
int tok_words(FILE *fh){
    int letters_seen = 0;
    int cp;
    StringLL *word = StringLL_new();

    for (;;) {
        cp = readCharcode(fh);
        if (cp < 0) {
            break;
        }
        if (!iswalpha(cp)) {
            // a separator: flush the word collected so far, if any
            if (letters_seen > 0) {
                letters_seen = 0;
                word = StringLL_next_print(word);
            }
            continue;
        }
        letters_seen++;
        if (!cASE_SENSITIVE) {
            // locale lower case
            cp = towlower(cp);
        }
        word = StringLL_appendchar(word, cp);
    }

    // input ended in the middle of a word
    if (letters_seen) {
        word = StringLL_next_print(word);
    }
    StringLL_destroy(word);
    return 0;
}
/**
 * Read one UTF-8 encoded character and return its code point.
 *
 * Sequences of 1 to 4 bytes are decoded. The continuation bytes are
 * fetched with readWagonChar().
 *
 * @param fh stream to read from
 * @return the code point on success;
 *         0 when the bytes do not form a supported sequence (a stray
 *         continuation byte, a lead byte of 248 or more, a missing or
 *         invalid continuation byte, or a value above U+10FFFF);
 *         -1 on EOF.
 */
int readCharcode(FILE* fh){
    int lead, code, remaining;

    lead = fgetc(fh);
    if (lead < 0) return -1;
    if (lead < 128) return lead;

    // The lead byte tells how many continuation bytes follow and carries
    // the most significant payload bits.
    if (192 <= lead && lead < 224) {
        code = lead - 192;
        remaining = 1;
    } else if (224 <= lead && lead < 240) {
        code = lead - 224;
        remaining = 2;
    } else if (240 <= lead && lead < 248) {
        code = lead - 240;
        remaining = 3;
    } else {
        return 0;
    }

    // Each continuation byte contributes six more bits.
    while (remaining-- > 0) {
        int cont = readWagonChar(fh);
        if (cont < 0) return 0;
        code = code * 64 + cont;
    }

    // A 4-byte lead can encode values beyond the Unicode range.
    if (code > 0x10FFFF) return 0;
    return code;
}
/**
 * Read one UTF-8 continuation ("wagon") byte, i.e. a byte in 128..191.
 *
 * A byte outside that range is pushed back onto the stream, so the
 * character that follows a malformed sequence is not lost.
 *
 * @param fh stream to read from
 * @return the six payload bits (0..63) of the continuation byte, or -1
 *         when the next byte is not a continuation byte or the stream is
 *         at EOF.
 */
int readWagonChar(FILE * fh){
    int c = fgetc(fh);
    if (c == EOF) return -1;
    if (c < 128 || c >= 192) {
        // not ours: leave it for the next read
        ungetc(c, fh);
        return -1;
    }
    return c - 128;
}
/**
 * Writes the ASCII replacement for one table entry into a buffer.
 *
 * The entry table1[offset] selects what is written at s[pos]:
 *   ' '        nothing is written
 *   uppercase  the lowercased entry, followed by table2[offset]
 *   otherwise  the entry itself
 * The caller must make sure s has room for two bytes at pos and that
 * offset is a valid index for both tables.
 *
 * @param s      destination buffer
 * @param pos    index in s where writing starts
 * @param offset index of the entry in table1 / table2
 * @param table1 primary replacement table
 * @param table2 second letters for the uppercase entries of table1
 * @return the number of bytes written (0, 1 or 2)
 */
int normCharcode(char * s, size_t pos, size_t offset, const char *table1, const char *table2){
    // <ctype.h> functions need a value representable as unsigned char
    const unsigned char entry = (unsigned char) table1[offset];

    if (entry == ' ') {
        return 0;
    }
    if (isupper(entry)) {
        s[pos] = (char) tolower(entry);
        s[pos + 1] = table2[offset];
        return 2;
    }
    s[pos] = table1[offset];
    return 1;
}
/***** StringLL functions *******/
/**
 * Allocates an empty word buffer with an initial capacity of 16 bytes.
 *
 * If memory cannot be obtained, an error is reported on stderr and the
 * process exits with status 1, so the result is never NULL.
 *
 * @return a new buffer; release it with StringLL_destroy()
 */
StringLL* StringLL_new (void) {
    StringLL *l = malloc(sizeof *l);
    if (!l) {
        perror("StringLL_new");
        exit(1);
    }
    l->len = 0;
    l->bufsize = 16;
    l->s = malloc(l->bufsize);
    if (!l->s) {
        perror("StringLL_new");
        free(l);
        exit(1);
    }
    return l;
}
/**
 * Doubles the capacity of a word buffer, keeping its contents.
 *
 * If the new size would overflow or memory cannot be obtained, an error is
 * reported on stderr and the process exits with status 1.
 *
 * @param l buffer to enlarge
 * @return l, with l->s and l->bufsize updated
 */
StringLL* StringLL_grow (StringLL* l){
    size_t wanted;
    char *bigger;

    if (l->bufsize > (size_t) -1 / 2) {
        fputs("StringLL_grow: buffer size overflow\n", stderr);
        exit(1);
    }
    // a zero-sized buffer cannot be doubled; restart from the initial size
    wanted = l->bufsize ? l->bufsize * 2 : 16;

    // keep the old pointer until realloc() is known to have succeeded
    bigger = realloc(l->s, wanted);
    if (!bigger) {
        perror("StringLL_grow");
        exit(1);
    }
    l->s = bigger;
    l->bufsize = wanted;
    return l;
}
/**
 * Appends one code point to a word buffer.
 *
 * Code point 0 is ignored. ASCII is stored as is. Unless nO_NORMALIZE is
 * set, code points 192..383 (Latin-1 letters and Latin Extended-A) are
 * replaced by the ASCII approximation produced by normCharcode(), which
 * may be empty. Everything else is stored as UTF-8 (2, 3 or 4 bytes).
 *
 * @param l buffer to append to
 * @param c code point to append
 * @return the buffer to keep using (the value returned by StringLL_grow()
 *         when the buffer had to be enlarged, otherwise l)
 */
StringLL* StringLL_appendchar(StringLL* l, int c){
    if (c == 0) {
        return l;
    }
    // keep room for the longest encoding (4 bytes) plus the terminating
    // NUL that StringLL_next_print() stores at s[len]
    if (l->bufsize - l->len <= 4){
        l = StringLL_grow(l);
    }
    if (c < 128){
        // ascii
        l->s[l->len++] = (char) c;
    } else if (!nO_NORMALIZE && 192 <= c && c < 256) {
        // latin-1 supplement
        l->len += normCharcode(l->s, l->len, c - 192, ASCII_DATA_192, ASCII_DATA_192_B);
    } else if (!nO_NORMALIZE && 256 <= c && c < 384) {
        // latin extended-A
        l->len += normCharcode(l->s, l->len, c - 256, ASCII_DATA_U0100, ASCII_DATA_U0100_B);
    } else if (c < 0x800) {
        // 2 byte UTF-8: 110xxxxx 10xxxxxx
        l->s[l->len++] = (char) (192 | (c >> 6));
        l->s[l->len++] = (char) (128 | (c & 63));
    } else if (c < 0x10000) {
        // 3 byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx
        l->s[l->len++] = (char) (224 | (c >> 12));
        l->s[l->len++] = (char) (128 | ((c >> 6) & 63));
        l->s[l->len++] = (char) (128 | (c & 63));
    } else {
        // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        l->s[l->len++] = (char) (240 | (c >> 18));
        l->s[l->len++] = (char) (128 | ((c >> 12) & 63));
        l->s[l->len++] = (char) (128 | ((c >> 6) & 63));
        // the last byte carries the LOW six bits of the code point
        l->s[l->len++] = (char) (128 | (c & 63));
    }
    return l;
}
/**
 * Prints the collected word on its own line and hands back an empty buffer.
 *
 * The bytes in use are NUL-terminated in place and written to stdout
 * followed by a newline. The same buffer is then emptied and returned, so
 * no allocation is needed per word.
 *
 * @param l buffer holding the word; must have at least one spare byte
 * @return an empty buffer for the next word (l itself, reset to length 0)
 */
StringLL* StringLL_next_print (StringLL *l){
    l->s[l->len] = 0;
    puts(l->s);
    // reuse the storage for the next word
    l->len = 0;
    return l;
}
/**
 * Releases a word buffer and the storage it owns.
 *
 * @param l buffer to release; NULL is accepted and ignored, like free()
 */
void StringLL_destroy (StringLL *l){
    if (!l) {
        return;
    }
    free(l->s);
    free(l);
}