#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Tallies for the two input files ("l1" = first file, "l2" = second file).
 * All of these have static storage duration and therefore start out zeroed.
 */

/* Occurrence count per byte value. */
static unsigned l1c[256];
static unsigned l2c[256];

/* Total number of bytes added to the per-byte counts. */
static unsigned long long l1ct;
static unsigned long long l2ct;

/* Occurrence count per byte pair, indexed [previous byte][current byte]. */
static unsigned l1_change[256][256];
static unsigned l2_change[256][256];

/* Total number of byte pairs added to the pair counts. */
static unsigned long long l1cht;
static unsigned long long l2cht;

/*
 * Decide whether a byte takes part in the statistics.
 *
 * Returns 1 for bytes with the high bit set (0x80 and above), for the space
 * character, and for anything isalpha() accepts; returns 0 otherwise.
 */
static int usable(unsigned char c)
{
	if (c >= 0x80) {
		return 1;
	}
	if (c == ' ') {
		return 1;
	}
	return isalpha(c) != 0;
}

/*
 * Read the stream f to end-of-file and accumulate byte statistics.
 *
 *   one_d  per-byte counts; incremented for every usable() byte
 *   t1     running total of the increments made to one_d
 *   two_d  pair counts indexed [previous byte][current byte]; incremented
 *          when either byte of the pair is usable()
 *   t2     running total of the increments made to two_d
 *
 * The "previous byte" of the very first byte read is taken to be 0.
 * Counts are added on top of whatever the arrays and totals already hold.
 */
static void count(FILE *f, unsigned one_d[256], unsigned long long *t1, unsigned two_d[256][256], unsigned long long *t2)
{
	int prev = 0;
	int cur;

	for (cur = fgetc(f); cur != EOF; cur = fgetc(f)) {
		const int cur_ok = usable(cur);

		if (cur_ok) {
			one_d[cur] += 1;
			*t1 += 1;
		}

		if (cur_ok || usable(prev)) {
			two_d[prev][cur] += 1;
			*t2 += 1;
		}

		prev = cur;
	}
}

| 34 |
static int diff(unsigned fr1_, unsigned long long fr1t, unsigned fr2_, unsigned long long fr2t) |
|---|
| 35 |
{ |
|---|
| 36 |
double fr1 = ((double)fr1_) / ((double)fr1t), |
|---|
| 37 |
fr2 = ((double)fr2_) / ((double)fr2t); |
|---|
| 38 |
|
|---|
| 39 |
double diff = sqrt(fr2) - sqrt(fr1); |
|---|
| 40 |
|
|---|
| 41 |
return diff * 65536. + .5; |
|---|
| 42 |
} |
|---|
| 43 |
|
|---|
| 44 |
int main (int argc, const char * argv[]) { |
|---|
| 45 |
if (argc < 3) return 1; |
|---|
| 46 |
FILE *t1 = fopen(argv[1], "rb"), *t2 = fopen(argv[2], "rb"); |
|---|
| 47 |
FILE *data = fopen("chardet.h", "w"); |
|---|
| 48 |
int i, j; |
|---|
| 49 |
|
|---|
| 50 |
count(t1, l1c, &l1ct, l1_change, &l1cht); |
|---|
| 51 |
count(t2, l2c, &l2ct, l2_change, &l2cht); |
|---|
| 52 |
|
|---|
| 53 |
fprintf(data, "static const short frequencies[] = {"); |
|---|
| 54 |
|
|---|
| 55 |
for (i = 0; i < 256; i++) { |
|---|
| 56 |
if (!(i % 16)) fprintf(data,"\n\t"); |
|---|
| 57 |
fprintf(data, "%d", diff(l1c[i], l1ct, l2c[i], l2ct)); |
|---|
| 58 |
if (i < 255) fprintf(data, ", "); |
|---|
| 59 |
} |
|---|
| 60 |
|
|---|
| 61 |
fprintf(data, "};\n\n"); |
|---|
| 62 |
|
|---|
| 63 |
fprintf(data, "static const short transitions[256][256] = {\n"); |
|---|
| 64 |
|
|---|
| 65 |
for (i = 0; i < 256; i++) { |
|---|
| 66 |
fprintf(data,"\t{ "); |
|---|
| 67 |
for (j = 0; j < 256; j++) { |
|---|
| 68 |
if (j && !(j % 16)) fprintf(data, "\n\t"); |
|---|
| 69 |
fprintf(data, "%d", diff(l1_change[i][j], l1cht, l2_change[i][j], l2cht)); |
|---|
| 70 |
if (j < 255) fprintf(data, ","); |
|---|
| 71 |
fprintf(data, " "); |
|---|
| 72 |
} |
|---|
| 73 |
fprintf(data, "}"); |
|---|
| 74 |
if (i < 255) fprintf(data, ",\n"); |
|---|
| 75 |
} |
|---|
| 76 |
|
|---|
| 77 |
fprintf(data, "};\n"); |
|---|
| 78 |
|
|---|
| 79 |
return 0; |
|---|
| 80 |
} |
|---|