root/trunk/Subtitles/latin-detector.c

Revision 821, 1.9 kB (checked in by astrange, 7 months ago)

Now that I've replaced my latin-1 files with ACTUAL latin-1 files, regenerate the latin1/2 detector again. Also add some miscellaneous tool sources I had lying around.

Line 
1 #include <stdio.h>
2 #include <ctype.h>
3 #include <math.h>
4
/* Per-byte occurrence counts for the first and second input file;
 * indexed by byte value and filled in by count(). */
static unsigned l1c[256] = { 0 };
static unsigned l2c[256] = { 0 };

/* Number of bytes tallied into l1c / l2c respectively. */
static unsigned long long l1ct = 0;
static unsigned long long l2ct = 0;

/* Byte-pair counts, indexed [previous byte][current byte]. */
static unsigned l1_change[256][256] = { { 0 } };
static unsigned l2_change[256][256] = { { 0 } };

/* Number of byte pairs tallied into l1_change / l2_change respectively. */
static unsigned long long l1cht = 0;
static unsigned long long l2cht = 0;
9
/*
 * Decide whether a byte takes part in the statistics.
 *
 * Returns 1 for bytes with the high bit set (>= 0x80), for the space
 * character, and for anything isalpha() accepts; returns 0 otherwise.
 */
static int usable(unsigned char c)
{
	if (c >= 0x80)
		return 1;
	if (c == ' ')
		return 1;
	return isalpha(c) != 0;
}
14
/*
 * Read stream f to EOF and accumulate byte statistics.
 *
 *   one_d  - per-byte counters; one_d[b] is bumped for every usable() byte b.
 *   t1     - running total of increments made to one_d.
 *   two_d  - pair counters; two_d[prev][cur] is bumped whenever either the
 *            previous or the current byte is usable().
 *   t2     - running total of increments made to two_d.
 *
 * The very first byte is paired with a "previous" value of 0.
 */
static void count(FILE *f, unsigned one_d[256], unsigned long long *t1, unsigned two_d[256][256], unsigned long long *t2)
{
	int prev = 0;
	int cur;

	for (cur = fgetc(f); cur != EOF; cur = fgetc(f)) {
		const int cur_ok = usable(cur);
		const int prev_ok = usable(prev);

		if (cur_ok) {
			one_d[cur] += 1;
			*t1 += 1;
		}

		if (prev_ok || cur_ok) {
			two_d[prev][cur] += 1;
			*t2 += 1;
		}

		prev = cur;
	}
}
33
34 static int diff(unsigned fr1_, unsigned long long fr1t, unsigned fr2_, unsigned long long fr2t)
35 {
36         double fr1 = ((double)fr1_) / ((double)fr1t),
37                    fr2 = ((double)fr2_) / ((double)fr2t);
38        
39         double diff = sqrt(fr2) - sqrt(fr1);
40        
41         return diff * 65536. + .5;
42 }
43
44 int main (int argc, const char * argv[]) {
45     if (argc < 3) return 1;
46         FILE *t1 = fopen(argv[1], "rb"), *t2 = fopen(argv[2], "rb");
47         FILE *data = fopen("chardet.h", "w");
48         int i, j;
49        
50         count(t1, l1c, &l1ct, l1_change, &l1cht);
51         count(t2, l2c, &l2ct, l2_change, &l2cht);
52        
53         fprintf(data, "static const short frequencies[] = {");
54        
55         for (i = 0; i < 256; i++) {
56                 if (!(i % 16)) fprintf(data,"\n\t");
57                 fprintf(data, "%d", diff(l1c[i], l1ct, l2c[i], l2ct));
58                 if (i < 255) fprintf(data, ", ");
59         }
60        
61         fprintf(data, "};\n\n");
62        
63         fprintf(data, "static const short transitions[256][256] = {\n");
64        
65         for (i = 0; i < 256; i++) {
66                 fprintf(data,"\t{ ");
67                 for (j = 0; j < 256; j++) {
68                         if (j && !(j % 16)) fprintf(data, "\n\t");
69                         fprintf(data, "%d", diff(l1_change[i][j], l1cht, l2_change[i][j], l2cht));
70                         if (j < 255) fprintf(data, ",");
71                         fprintf(data, " ");
72                 }
73                 fprintf(data, "}");
74                 if (i < 255) fprintf(data, ",\n");
75         }
76        
77         fprintf(data, "};\n");
78        
79     return 0;
80 }
Note: See TracBrowser for help on using the browser.