/* Tokenize words, letter by letter.
 * Supports Latin characters.
 * Compile with gcc.
 *
 * (c) 2021 Sakuragasaki46
 */
|
|||
|
|
|
|||
|
|
#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>
|
|||
|
|
|
|||
|
|
|
|||
|
|
// user interface
// HELP is used as a printf() format string: both %s placeholders receive
// the program name (argv[0]).
const char HELP[] =
    "%s - words in a text file\n"
    "Prints a list of words in lowercase form, one per line.\n"
    "Supports UTF-8 strings.\n"
    "\n"
    "Usage: %s [filename]\n"
    "\n"
    "Arguments:\n"
    "\tfilename\t\tfilename to analyze (defaults to stdin)\n"
    "\t-s\t\tcase sensitive\n"
    "\t-n\t\tdon’t normalize non-ASCII characters\n"
    "\t--help\t\tshows this help message and exits\n"
    "\t--version\t\tshow version and exit\n";
// printed verbatim by --version
const char VERSION[] = "0.2";
|
|||
|
|
|
|||
|
|
// behavior switches, set by main() while parsing the command line

// -s: when non-zero, letters are not converted to lowercase
int cASE_SENSITIVE = 0;
// -n: when non-zero, accented Latin letters are kept as UTF-8 instead of
// being replaced through the ASCII_DATA_* tables
int nO_NORMALIZE = 0;
|
|||
|
|
|
|||
|
|
// table data for normCharcode()
//
// Each pair of tables is indexed by (code point - first code point of the
// range) and MUST have the same length, column for column:
//   - first table:  the ASCII replacement letter; ' ' means "no replacement"
//   - second table: when the letter in the first table is uppercase, the
//     replacement is two letters long: the lowercased letter from the first
//     table followed by the letter found here at the same position.
//     Every other column holds a space.
// The literals are split in 32-column pieces so the alignment between the
// two tables of a pair can be checked by eye.

// latin-1 (U+00C0 .. U+00FF)
const char ASCII_DATA_192[] =
    "aaaaaaAceeeeiiiiDnooooo OuuuuyTS"
    "aaaaaaAceeeeiiiiDnooooo OuuuuyTy";
const char ASCII_DATA_192_B[] =
    "      e         h       e     hs"
    "      e         h       e     h ";

// latin extended A (U+0100 .. U+017F)
const char ASCII_DATA_U0100[] =
    "aaaaaaccCCccCCddDDeeeeeeeeYYgggg"
    "gggghhhhiiiiiiiiiiIIjjkkklllllll"
    "lllnnnnnnnNNooooooOOrrrrrrssSSSS"
    "SSTTttttuuuuuuOOuuuuwwyyyZZzzZZs";
const char ASCII_DATA_U0100_B[] =
    "        hh  hh  jj        ee    "
    "                  jj            "
    "          gg      ee        hhhh"
    "hhss          uu         hh  hh ";
|
|||
|
|
|
|||
|
|
|
|||
|
|
/**
 * Growable byte buffer holding the word that is currently being assembled.
 * (The "linked list" in the name is historical: there is no next pointer.)
 */
typedef struct string_linkedlist_s {
    char *s;         // heap buffer receiving the bytes of the word
    size_t len;      // number of bytes currently stored in s
    size_t bufsize;  // allocated size of s, in bytes
} StringLL;

int main_file(const char *filename);
int main_stdin(void);
int tok_words(FILE *fh);
int readCharcode(FILE *fh);
int readWagonChar(FILE *fh);
int normCharcode(char *s, size_t pos, size_t offset, const char *table1, const char *table2);

StringLL *StringLL_new(void);
StringLL *StringLL_grow(StringLL *l);
StringLL *StringLL_appendchar(StringLL *l, int c);
StringLL *StringLL_next_print(StringLL *l);
void StringLL_destroy(StringLL *l);
|
|||
|
|
|
|||
|
|
int main(int argc, char *argv[]) {
|
|||
|
|
int curarg = 1;
|
|||
|
|
|
|||
|
|
// set locale, otherwise towlower() does not work
|
|||
|
|
setlocale(LC_ALL, "en_US.UTF-8");
|
|||
|
|
|
|||
|
|
if (argc == 1) {
|
|||
|
|
return main_stdin();
|
|||
|
|
}
|
|||
|
|
while (curarg < argc){
|
|||
|
|
if (!strcmp(argv[1], "--help")) {
|
|||
|
|
printf(HELP, argv[0], argv[0]);
|
|||
|
|
exit(0);
|
|||
|
|
} else if (!strcmp(argv[curarg], "--version")) {
|
|||
|
|
puts(VERSION);
|
|||
|
|
exit(0);
|
|||
|
|
} else if (!strcmp(argv[curarg], "-s")) {
|
|||
|
|
cASE_SENSITIVE = 1;
|
|||
|
|
} else if (!strcmp(argv[curarg], "-n")) {
|
|||
|
|
nO_NORMALIZE = 1;
|
|||
|
|
} else if (!strcmp(argv[curarg], "-") || argv[curarg][0] == '\0') {
|
|||
|
|
return main_stdin();
|
|||
|
|
} else if (strncmp(argv[curarg], "-", 1)) {
|
|||
|
|
return main_file(argv[curarg]);
|
|||
|
|
} else {
|
|||
|
|
fprintf(stderr, "Unknown option: \"%s\"\n", argv[curarg]);
|
|||
|
|
return 1;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return 0;
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
/**
 * Tokenize the words of a named file.
 *
 * Terminates the program with status 1 (after a message on stderr) when the
 * file cannot be opened.
 *
 * @param filename path of the file to read
 * @return the value returned by tok_words()
 */
int main_file(const char* filename){
    FILE *fh = fopen(filename, "r");
    int status;

    if (!fh) {
        fprintf(stderr, "[Errno %d] Could not open file \"%s\"\n", errno, filename);
        exit(1);
    }

    status = tok_words(fh);
    // the handle was opened here, so it is released here
    fclose(fh);
    return status;
}
|
|||
|
|
|
|||
|
|
/**
 * Tokenize the words read from standard input.
 *
 * @return the value returned by tok_words()
 */
int main_stdin(void){
    return tok_words(stdin);
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Main activity.
|
|||
|
|
*
|
|||
|
|
* @param fh a read-only file handle
|
|||
|
|
*/
|
|||
|
|
int tok_words(FILE *fh){
|
|||
|
|
int charcode = 0, intoword = 0;
|
|||
|
|
StringLL *lend;
|
|||
|
|
|
|||
|
|
lend = StringLL_new();
|
|||
|
|
|
|||
|
|
while ((charcode = readCharcode(fh)) >= 0) {
|
|||
|
|
if (iswalpha(charcode)){
|
|||
|
|
intoword++;
|
|||
|
|
|
|||
|
|
// locale lower case
|
|||
|
|
if (!cASE_SENSITIVE) {
|
|||
|
|
charcode = towlower(charcode);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
lend = StringLL_appendchar(lend, charcode);
|
|||
|
|
} else if (intoword > 0) {
|
|||
|
|
intoword = 0;
|
|||
|
|
|
|||
|
|
lend = StringLL_next_print(lend);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (intoword){
|
|||
|
|
lend = StringLL_next_print(lend);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
StringLL_destroy(lend);
|
|||
|
|
return 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
 * Read one UTF-8 continuation byte (10xxxxxx).
 *
 * A byte that is not a continuation byte is pushed back on the stream, so
 * the character that follows a broken sequence is not lost.
 *
 * @return the 6 payload bits (0..63), or -1 if the next byte is not a
 *         continuation byte or the stream is at EOF
 */
int readWagonChar(FILE * fh){
    int c = fgetc(fh);

    if (c < 0x80 || c > 0xBF) {
        if (c != EOF) {
            ungetc(c, fh);
        }
        return -1;
    }
    return c & 0x3F;
}

/**
 * Read an UTF-8 character, and return its code.
 *
 * Sequences of 1 to 4 bytes are decoded. Invalid input (stray continuation
 * byte, invalid lead byte, truncated sequence, overlong encoding, UTF-16
 * surrogate, value above U+10FFFF) yields 0.
 *
 * @return a non-negative value on success (0 for invalid input),
 *         -1 on EOF.
 */
int readCharcode(FILE* fh){
    int lead = fgetc(fh);
    int code;     // code point being assembled
    int wagons;   // continuation bytes still expected
    int minimum;  // smallest code point that needs this many bytes

    if (lead == EOF) return -1;
    if (lead < 0x80) return lead;

    if (lead >= 0xC0 && lead < 0xE0) {
        code = lead & 0x1F;
        wagons = 1;
        minimum = 0x80;
    } else if (lead >= 0xE0 && lead < 0xF0) {
        code = lead & 0x0F;
        wagons = 2;
        minimum = 0x800;
    } else if (lead >= 0xF0 && lead < 0xF8) {
        code = lead & 0x07;
        wagons = 3;
        minimum = 0x10000;
    } else {
        // stray continuation byte (0x80..0xBF) or invalid lead (0xF8..0xFF)
        return 0;
    }

    for (; wagons > 0; wagons--) {
        int payload = readWagonChar(fh);
        if (payload < 0) return 0;
        code = (code << 6) | payload;
    }

    if (code < minimum || code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)) {
        return 0;
    }
    return code;
}
|
|||
|
|
|
|||
|
|
/**
 * Write the ASCII replacement of one accented character into a buffer.
 *
 * The replacement is looked up at index `offset` of the two tables:
 *   - table1 holds ' '            -> nothing is written
 *   - table1 holds an uppercase   -> two bytes are written: that letter
 *     letter                         lowercased, then table2[offset]
 *   - anything else               -> table1[offset] is written as is
 *
 * No bounds are checked: the caller must make sure `offset` is a valid
 * index of both tables and that `s` has room for two bytes at `pos`.
 *
 * @param s      destination buffer
 * @param pos    index in s where the first byte is written
 * @param offset index of the character inside the tables
 * @param table1 primary replacement table
 * @param table2 second-letter table, same layout as table1
 * @return number of bytes written to s (0, 1 or 2)
 */
int normCharcode(char * s, size_t pos, size_t offset, const char *table1, const char *table2){
    // <ctype.h> functions require a value representable as unsigned char
    const unsigned char first = (unsigned char) table1[offset];

    if (first == ' ') {
        return 0;
    }

    if (isupper(first)) {
        s[pos] = (char) tolower(first);
        s[pos + 1] = table2[offset];
        return 2;
    }

    s[pos] = (char) first;
    return 1;
}
|
|||
|
|
|
|||
|
|
/***** StringLL functions *******/
|
|||
|
|
|
|||
|
|
|
|||
|
|
StringLL* StringLL_new () {
|
|||
|
|
StringLL* l;
|
|||
|
|
|
|||
|
|
l = (StringLL*) malloc (sizeof(StringLL));
|
|||
|
|
l->bufsize = 16;
|
|||
|
|
l->s = (char *) malloc(l->bufsize);
|
|||
|
|
l->len = 0;
|
|||
|
|
|
|||
|
|
return l;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
 * Double the capacity of a word buffer, keeping its content.
 *
 * Out of memory (or a capacity that no longer fits in size_t) is fatal:
 * a message is printed on stderr and the program exits with status 1.
 *
 * @param l buffer to enlarge
 * @return l
 */
StringLL* StringLL_grow (StringLL* l){
    char *bigger = NULL;

    if (l->bufsize <= SIZE_MAX / 2) {
        // keep the old pointer until realloc() is known to have succeeded
        bigger = realloc(l->s, l->bufsize * 2);
    }
    if (bigger == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }

    l->s = bigger;
    l->bufsize *= 2;
    return l;
}
|
|||
|
|
|
|||
|
|
/**
 * Append one character, given as a code point, to a word buffer.
 *
 * ASCII is stored as is. Unless nO_NORMALIZE is set, code points
 * 192..255 and 256..383 are replaced by one or two ASCII letters through
 * the ASCII_DATA_* tables (or dropped when the table holds a space).
 * Everything else is stored as a 2, 3 or 4 byte UTF-8 sequence.
 *
 * @param l buffer to append to
 * @param c code point; values <= 0 are ignored
 * @return the buffer to keep using (as returned by StringLL_grow())
 */
StringLL* StringLL_appendchar(StringLL* l, int c){
    if (c <= 0) {
        return l;
    }

    // keep room for the longest sequence (4 bytes) plus the terminating
    // NUL that StringLL_next_print() adds
    if (l->bufsize - l->len <= 4){
        l = StringLL_grow(l);
    }

    if (c < 0x80){
        // ascii
        l->s[l->len++] = (char) c;
    } else if (!nO_NORMALIZE && 192 <= c && c < 256) {
        // latin-1 supplement
        l->len += normCharcode(l->s, l->len, (size_t) (c - 192), ASCII_DATA_192, ASCII_DATA_192_B);
    } else if (!nO_NORMALIZE && 256 <= c && c < 384) {
        // latin extended-A
        l->len += normCharcode(l->s, l->len, (size_t) (c - 256), ASCII_DATA_U0100, ASCII_DATA_U0100_B);
    } else if (c < 0x800) {
        // 2 byte UTF-8
        l->s[l->len++] = (char) (0xC0 | (c >> 6));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    } else if (c < 0x10000) {
        // 3 byte UTF-8
        l->s[l->len++] = (char) (0xE0 | (c >> 12));
        l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    } else {
        // 4-byte UTF-8
        l->s[l->len++] = (char) (0xF0 | ((c >> 18) & 0x07));
        l->s[l->len++] = (char) (0x80 | ((c >> 12) & 0x3F));
        l->s[l->len++] = (char) (0x80 | ((c >> 6) & 0x3F));
        l->s[l->len++] = (char) (0x80 | (c & 0x3F));
    }
    return l;
}
|
|||
|
|
|
|||
|
|
/**
 * Print the word held in the buffer, followed by a newline, and hand back
 * an empty buffer for the next word.
 *
 * The same buffer is emptied and reused, which avoids one allocation and
 * one release per word.
 *
 * @param l buffer holding the finished word
 * @return an empty buffer for the next word; the caller must use this
 *         pointer from now on
 */
StringLL* StringLL_next_print (StringLL *l){
    l->s[l->len] = 0;
    puts(l->s);
    l->len = 0;
    return l;
}
|
|||
|
|
|
|||
|
|
/**
 * Release a word buffer and the memory it owns.
 *
 * @param l buffer to release; NULL is accepted and ignored
 */
void StringLL_destroy (StringLL *l){
    if (l == NULL) {
        return;
    }
    free(l->s);
    free(l);
}
|
|||
|
|
|