commit : bdbfac0f43647486d6fc54cf692220a607344151
Author: wps_op_zh <wps_op_zh@kingsoft.com>
Date: 2022-03-09 12:16:16
Message:
There are some bugs when detecting encodings of mi-document, so the following modifications are made.

 1. add two charset constants;
 2. correct the getConfidence method of CharDistributionAnalysis, referring to version 2.0.0;
 3. improve the priority of Chinese and BIG5 for multi-byte encodings;
 4. if getConfidence returns 0.99, treat the encoding as found for ESC_ASCII, multi-byte and single-byte encodings;

Detect Chinese and Japanese encodings more accurately
-----------------------------------------------------------------------------------------------------------------------------------
diff --git a/src/main/java/org/mozilla/universalchardet/Constants.java b/src/main/java/org/mozilla/universalchardet/Constants.java
--- a/src/main/java/org/mozilla/universalchardet/Constants.java	(revision ba430cab541a571676a824c0e6be21042f7a8efe)
+++ b/src/main/java/org/mozilla/universalchardet/Constants.java	(revision bdbfac0f43647486d6fc54cf692220a607344151)
@@ -28,6 +28,10 @@
     public static final String CHARSET_UTF_32BE     = "UTF-32BE".intern();
     public static final String CHARSET_UTF_32LE     = "UTF-32LE".intern();
 
+    // Listed charset names (GBK, TIS-620) as constants, avoiding bugs caused by letter case
+    public static final String CHARSET_GBK          = "GBK".intern();
+    public static final String CHARSET_TIS_620      = "TIS-620".intern();
+
     // WARNING: Listed below are charsets which Java does not support.
     public static final String CHARSET_HZ_GB_2312   = "HZ-GB-2312".intern(); // Simplified Chinese
     public static final String CHARSET_X_ISO_10646_UCS_4_3412 = "X-ISO-10646-UCS-4-3412".intern(); // Malformed UTF-32
Index: src/main/java/org/mozilla/universalchardet/UniversalDetector.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/main/java/org/mozilla/universalchardet/UniversalDetector.java b/src/main/java/org/mozilla/universalchardet/UniversalDetector.java
--- a/src/main/java/org/mozilla/universalchardet/UniversalDetector.java	(revision ba430cab541a571676a824c0e6be21042f7a8efe)
+++ b/src/main/java/org/mozilla/universalchardet/UniversalDetector.java	(revision bdbfac0f43647486d6fc54cf692220a607344151)
@@ -214,7 +214,7 @@
                 this.escCharsetProber = new EscCharsetProber();
             }
             st = this.escCharsetProber.handleData(buf, offset, length);
-            if (st == CharsetProber.ProbingState.FOUND_IT) {
+            if (st == CharsetProber.ProbingState.FOUND_IT || 0.99f == this.escCharsetProber.getConfidence()) {
                 this.done = true;
                 this.detectedCharset = this.escCharsetProber.getCharSetName();
             }
Index: src/main/java/org/mozilla/universalchardet/prober/MBCSGroupProber.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/main/java/org/mozilla/universalchardet/prober/MBCSGroupProber.java b/src/main/java/org/mozilla/universalchardet/prober/MBCSGroupProber.java
--- a/src/main/java/org/mozilla/universalchardet/prober/MBCSGroupProber.java	(revision ba430cab541a571676a824c0e6be21042f7a8efe)
+++ b/src/main/java/org/mozilla/universalchardet/prober/MBCSGroupProber.java	(revision bdbfac0f43647486d6fc54cf692220a607344151)
@@ -61,12 +61,12 @@
         this.probers = new CharsetProber[7];
         this.isActive = new boolean[7];
         
-        this.probers[0] = new UTF8Prober();
-        this.probers[1] = new SJISProber();
-        this.probers[2] = new EUCJPProber();
-        this.probers[3] = new GB18030Prober();
-        this.probers[4] = new EUCKRProber();
-        this.probers[5] = new Big5Prober();
+        this.probers[0] = new GB18030Prober();
+        this.probers[1] = new UTF8Prober();
+        this.probers[2] = new Big5Prober();
+        this.probers[3] = new SJISProber();
+        this.probers[4] = new EUCJPProber();
+        this.probers[5] = new EUCKRProber();
         this.probers[6] = new EUCTWProber();
         
         reset();
@@ -145,7 +145,7 @@
                 continue;
             }
             st = this.probers[i].handleData(highbyteBuf, 0, highpos);
-            if (st == ProbingState.FOUND_IT) {
+            if (st == ProbingState.FOUND_IT || 0.99f == this.probers[i].getConfidence()) {
                 this.bestGuess = i;
                 this.state = ProbingState.FOUND_IT;
                 break;
Index: src/main/java/org/mozilla/universalchardet/prober/SBCSGroupProber.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/main/java/org/mozilla/universalchardet/prober/SBCSGroupProber.java b/src/main/java/org/mozilla/universalchardet/prober/SBCSGroupProber.java
--- a/src/main/java/org/mozilla/universalchardet/prober/SBCSGroupProber.java	(revision ba430cab541a571676a824c0e6be21042f7a8efe)
+++ b/src/main/java/org/mozilla/universalchardet/prober/SBCSGroupProber.java	(revision bdbfac0f43647486d6fc54cf692220a607344151)
@@ -170,7 +170,7 @@
                     continue;
                 }
                 st = this.probers[i].handleData(newbuf.array(), 0, newbuf.position());
-                if (st == ProbingState.FOUND_IT) {
+                if (st == ProbingState.FOUND_IT || 0.99f == this.probers[i].getConfidence()) {
                     this.bestGuess = i;
                     this.state = ProbingState.FOUND_IT;
                     break;
Index: src/main/java/org/mozilla/universalchardet/prober/distributionanalysis/CharDistributionAnalysis.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/main/java/org/mozilla/universalchardet/prober/distributionanalysis/CharDistributionAnalysis.java b/src/main/java/org/mozilla/universalchardet/prober/distributionanalysis/CharDistributionAnalysis.java
--- a/src/main/java/org/mozilla/universalchardet/prober/distributionanalysis/CharDistributionAnalysis.java	(revision ba430cab541a571676a824c0e6be21042f7a8efe)
+++ b/src/main/java/org/mozilla/universalchardet/prober/distributionanalysis/CharDistributionAnalysis.java	(revision bdbfac0f43647486d6fc54cf692220a607344151)
@@ -95,7 +95,7 @@
         }
         
         if (this.totalChars != this.freqChars) {
-            float r = this.freqChars / (this.totalChars - this.freqChars) * this.typicalDistributionRatio;
+            float r = this.freqChars / ((this.totalChars - this.freqChars) * this.typicalDistributionRatio);
             
             if (r < SURE_YES) {
                 return r;

