Changeset 495

Show
Ignore:
Timestamp:
05/05/07 13:52:50 (3 years ago)
Author:
astrange
Message:

Hack to possibly recognize some Latin-2 subtitles better. Refs #172

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/Categories.m

    r406 r495  
    8888        NSData *data = [NSData dataWithContentsOfMappedFile:file]; 
    8989        UniversalDetector *ud = [[UniversalDetector alloc] init]; 
    90         NSString *res; 
    91         CFStringEncoding enc; 
     90        NSString *res = nil; 
     91        NSStringEncoding enc; 
     92        float conf; 
     93        NSString *enc_str; 
    9294         
    9395        [ud analyzeData:data]; 
    9496         
    9597        enc = [ud encoding]; 
     98        conf = [ud confidence]; 
     99        enc_str = [ud MIMECharset]; 
    96100         
    97         if ([ud confidence] < .7)  
    98                 Codecprintf(NULL,"Guessed encoding \"%s\" for \"%s\", but not sure (confidence %f%%).\n",[[ud MIMECharset] UTF8String],[file UTF8String],[ud confidence]*100.); 
    99                  
     101        if (conf < .5) { 
     102                if ([enc_str isEqualToString:@"windows-1251"]) { // this may or may not be a good idea... 
     103                        enc = NSWindowsCP1250StringEncoding; // UD is bad at guessing latin2, so if it has a poor match for 1251 we change it to this 
     104            Codecprintf(NULL,"Guessed encoding \"%s\" for \"%s\", but confidence only %f%%. Trying windows-1250.\n",[enc_str UTF8String],[file UTF8String],conf*100.); 
     105                } else if (![enc_str isEqualToString:@"US-ASCII"]) Codecprintf(NULL,"Guessed encoding \"%s\" for \"%s\", but not sure (confidence %f%%).\n",[enc_str UTF8String],[file UTF8String],conf*100.); 
     106        } 
     107         
    100108        res = [[[NSString alloc] initWithData:data encoding:enc] autorelease]; 
    101109         
    102         if (!res) Codecprintf(NULL,"Failed to load file as guessed encoding %s.\n",[[ud MIMECharset] UTF8String]); 
     110        if (!res) Codecprintf(NULL,"Failed to load file as guessed encoding %s.\n",[enc_str UTF8String]); 
    103111        [ud release]; 
    104  
     112         
    105113        return res; 
    106114} 
  • trunk/UniversalDetector/universalchardet/src/nsCharSetProber.h

    r406 r495  
    4141#include "nscore.h" 
    4242 
    43 //#define DEBUG_chardet // Uncomment this for debug dump. 
     43#define DEBUG_chardet // Uncomment this for debug dump. 
    4444 
    4545typedef enum { 
  • trunk/UniversalDetector/universalchardet/src/nsSBCSGroupProber.cpp

    r406 r495  
    218218      mProbers[i]->DumpStatus(); 
    219219  } 
    220   printf(" SBCS Group found best match [%s] confidence %f.\r\n",   
     220  if (mProbers[mBestGuess]) printf(" SBCS Group found best match [%s] confidence %f.\r\n",   
    221221         mProbers[mBestGuess]->GetCharSetName(), cf); 
    222222}