update encoding detection

This commit is contained in:
Luke Pulverenti 2017-04-18 01:53:39 -04:00
parent 6a66aef608
commit 4d7d8961b4
46 changed files with 9698 additions and 224 deletions

View File

@ -1,16 +1,25 @@
using System.Text;
using System;
using System.Text;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Text;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using MediaBrowser.Model.MediaInfo;
using MediaBrowser.Model.Logging;
using UniversalDetector;
namespace Emby.Common.Implementations.TextEncoding
{
public class TextEncoding : ITextEncoding
{
private readonly IFileSystem _fileSystem;
private readonly ILogger _logger;
public TextEncoding(IFileSystem fileSystem)
public TextEncoding(IFileSystem fileSystem, ILogger logger)
{
_fileSystem = fileSystem;
_logger = logger;
}
public Encoding GetASCIIEncoding()
@ -18,16 +27,8 @@ namespace Emby.Common.Implementations.TextEncoding
return Encoding.ASCII;
}
public Encoding GetFileEncoding(string srcFile)
private Encoding GetInitialEncoding(byte[] buffer)
{
// *** Detect byte order mark if any - otherwise assume default
var buffer = new byte[5];
using (var file = _fileSystem.OpenRead(srcFile))
{
file.Read(buffer, 0, 5);
}
if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
return Encoding.UTF8;
if (buffer[0] == 0xfe && buffer[1] == 0xff)
@ -37,7 +38,154 @@ namespace Emby.Common.Implementations.TextEncoding
if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
return Encoding.UTF7;
var result = new TextEncodingDetect().DetectEncoding(buffer, buffer.Length);
switch (result)
{
case TextEncodingDetect.CharacterEncoding.Ansi:
return Encoding.ASCII;
case TextEncodingDetect.CharacterEncoding.Ascii:
return Encoding.ASCII;
case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf8Bom:
return Encoding.UTF8;
case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
return Encoding.UTF8;
default:
return null;
}
}
public string GetDetectedEncodingName(byte[] bytes, string language)
{
var encoding = GetInitialEncoding(bytes);
if (encoding != null && encoding.Equals(Encoding.UTF8))
{
return "utf-8";
}
var charset = DetectCharset(bytes, language);
if (!string.IsNullOrWhiteSpace(charset))
{
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
{
return "utf-8";
}
if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
{
return charset;
}
}
if (!string.IsNullOrWhiteSpace(language))
{
return GetFileCharacterSetFromLanguage(language);
}
return null;
}
public Encoding GetEncodingFromCharset(string charset)
{
if (string.IsNullOrWhiteSpace(charset))
{
throw new ArgumentNullException("charset");
}
_logger.Debug("Getting encoding object for character set: {0}", charset);
try
{
return Encoding.GetEncoding(charset);
}
catch (ArgumentException)
{
charset = charset.Replace("-", string.Empty);
_logger.Debug("Getting encoding object for character set: {0}", charset);
return Encoding.GetEncoding(charset);
}
}
public Encoding GetDetectedEncoding(byte[] bytes, string language)
{
var charset = GetDetectedEncodingName(bytes, language);
return GetEncodingFromCharset(charset);
}
private string GetFileCharacterSetFromLanguage(string language)
{
// https://developer.xamarin.com/api/type/System.Text.Encoding/
switch (language.ToLower())
{
case "hun":
return "windows-1252";
case "pol":
case "cze":
case "ces":
case "slo":
case "slk":
case "slv":
case "srp":
case "hrv":
case "rum":
case "ron":
case "rup":
case "alb":
case "sqi":
return "windows-1250";
case "ara":
return "windows-1256";
case "heb":
return "windows-1255";
case "grc":
case "gre":
return "windows-1253";
case "crh":
case "ota":
case "tur":
return "windows-1254";
case "rus":
return "windows-1251";
case "vie":
return "windows-1258";
case "kor":
return "cp949";
default:
return "windows-1252";
}
}
private string DetectCharset(byte[] bytes, string language)
{
var detector = new CharsetDetector();
detector.Feed(bytes, 0, bytes.Length);
detector.DataEnd();
var charset = detector.Charset;
// This is often incorrectly indetected. If this happens, try to use other techniques instead
if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase))
{
if (!string.IsNullOrWhiteSpace(language))
{
return null;
}
}
return charset;
}
}
}

View File

@ -0,0 +1,414 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
namespace Emby.Common.Implementations.TextEncoding
{
// Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
//
// https://www.autoitscript.com
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// <summary>
/// Credit: https://github.com/AutoIt/text-encoding-detect
/// </summary>
public class TextEncodingDetect
{
private readonly byte[] _utf16BeBom =
{
0xFE,
0xFF
};
private readonly byte[] _utf16LeBom =
{
0xFF,
0xFE
};
private readonly byte[] _utf8Bom =
{
0xEF,
0xBB,
0xBF
};
private bool _nullSuggestsBinary = true;
private double _utf16ExpectedNullPercent = 70;
private double _utf16UnexpectedNullPercent = 10;
public enum CharacterEncoding
{
None, // Unknown or binary
Ansi, // 0-255
Ascii, // 0-127
Utf8Bom, // UTF8 with BOM
Utf8Nobom, // UTF8 without BOM
Utf16LeBom, // UTF16 LE with BOM
Utf16LeNoBom, // UTF16 LE without BOM
Utf16BeBom, // UTF16-BE with BOM
Utf16BeNoBom // UTF16-BE without BOM
}
/// <summary>
/// Sets if the presence of nulls in a buffer indicate the buffer is binary data rather than text.
/// </summary>
public bool NullSuggestsBinary
{
set
{
_nullSuggestsBinary = value;
}
}
public double Utf16ExpectedNullPercent
{
set
{
if (value > 0 && value < 100)
{
_utf16ExpectedNullPercent = value;
}
}
}
public double Utf16UnexpectedNullPercent
{
set
{
if (value > 0 && value < 100)
{
_utf16UnexpectedNullPercent = value;
}
}
}
/// <summary>
/// Gets the BOM length for a given Encoding mode.
/// </summary>
/// <param name="encoding"></param>
/// <returns>The BOM length.</returns>
public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
{
int length;
switch (encoding)
{
case CharacterEncoding.Utf16BeBom:
case CharacterEncoding.Utf16LeBom:
length = 2;
break;
case CharacterEncoding.Utf8Bom:
length = 3;
break;
default:
length = 0;
break;
}
return length;
}
/// <summary>
/// Checks for a BOM sequence in a byte buffer.
/// </summary>
/// <param name="buffer"></param>
/// <param name="size"></param>
/// <returns>Encoding type or Encoding.None if no BOM.</returns>
public CharacterEncoding CheckBom(byte[] buffer, int size)
{
// Check for BOM
if (size >= 2 && buffer[0] == _utf16LeBom[0] && buffer[1] == _utf16LeBom[1])
{
return CharacterEncoding.Utf16LeBom;
}
if (size >= 2 && buffer[0] == _utf16BeBom[0] && buffer[1] == _utf16BeBom[1])
{
return CharacterEncoding.Utf16BeBom;
}
if (size >= 3 && buffer[0] == _utf8Bom[0] && buffer[1] == _utf8Bom[1] && buffer[2] == _utf8Bom[2])
{
return CharacterEncoding.Utf8Bom;
}
return CharacterEncoding.None;
}
/// <summary>
/// Automatically detects the Encoding type of a given byte buffer.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>The Encoding type or Encoding.None if unknown.</returns>
public CharacterEncoding DetectEncoding(byte[] buffer, int size)
{
// First check if we have a BOM and return that if so
CharacterEncoding encoding = CheckBom(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
// Now check for valid UTF8
encoding = CheckUtf8(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
// Now try UTF16
encoding = CheckUtf16NewlineChars(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
encoding = CheckUtf16Ascii(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
// ANSI or None (binary) then
if (!DoesContainNulls(buffer, size))
{
return CharacterEncoding.Ansi;
}
// Found a null, return based on the preference in null_suggests_binary_
return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
}
/// <summary>
/// Checks if a buffer contains text that looks like utf16 by scanning for
/// newline chars that would be present even in non-english text.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>Encoding.none, Encoding.Utf16LeNoBom or Encoding.Utf16BeNoBom.</returns>
private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
{
if (size < 2)
{
return CharacterEncoding.None;
}
// Reduce size by 1 so we don't need to worry about bounds checking for pairs of bytes
size--;
var leControlChars = 0;
var beControlChars = 0;
uint pos = 0;
while (pos < size)
{
byte ch1 = buffer[pos++];
byte ch2 = buffer[pos++];
if (ch1 == 0)
{
if (ch2 == 0x0a || ch2 == 0x0d)
{
++beControlChars;
}
}
else if (ch2 == 0)
{
if (ch1 == 0x0a || ch1 == 0x0d)
{
++leControlChars;
}
}
// If we are getting both LE and BE control chars then this file is not utf16
if (leControlChars > 0 && beControlChars > 0)
{
return CharacterEncoding.None;
}
}
if (leControlChars > 0)
{
return CharacterEncoding.Utf16LeNoBom;
}
return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
}
/// <summary>
/// Checks if a buffer contains any nulls. Used to check for binary vs text data.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
private static bool DoesContainNulls(byte[] buffer, int size)
{
uint pos = 0;
while (pos < size)
{
if (buffer[pos++] == 0)
{
return true;
}
}
return false;
}
/// <summary>
/// Checks if a buffer contains text that looks like utf16. This is done based
/// on the use of nulls which in ASCII/script like text can be useful to identify.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>Encoding.none, Encoding.Utf16LeNoBom or Encoding.Utf16BeNoBom.</returns>
private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
{
var numOddNulls = 0;
var numEvenNulls = 0;
// Get even nulls
uint pos = 0;
while (pos < size)
{
if (buffer[pos] == 0)
{
numEvenNulls++;
}
pos += 2;
}
// Get odd nulls
pos = 1;
while (pos < size)
{
if (buffer[pos] == 0)
{
numOddNulls++;
}
pos += 2;
}
double evenNullThreshold = numEvenNulls * 2.0 / size;
double oddNullThreshold = numOddNulls * 2.0 / size;
double expectedNullThreshold = _utf16ExpectedNullPercent / 100.0;
double unexpectedNullThreshold = _utf16UnexpectedNullPercent / 100.0;
// Lots of odd nulls, low number of even nulls
if (evenNullThreshold < unexpectedNullThreshold && oddNullThreshold > expectedNullThreshold)
{
return CharacterEncoding.Utf16LeNoBom;
}
// Lots of even nulls, low number of odd nulls
if (oddNullThreshold < unexpectedNullThreshold && evenNullThreshold > expectedNullThreshold)
{
return CharacterEncoding.Utf16BeNoBom;
}
// Don't know
return CharacterEncoding.None;
}
/// <summary>
/// Checks if a buffer contains valid utf8.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>
/// Encoding type of Encoding.None (invalid UTF8), Encoding.Utf8NoBom (valid utf8 multibyte strings) or
/// Encoding.ASCII (data in 0.127 range).
/// </returns>
/// <returns>2</returns>
private CharacterEncoding CheckUtf8(byte[] buffer, int size)
{
// UTF8 Valid sequences
// 0xxxxxxx ASCII
// 110xxxxx 10xxxxxx 2-byte
// 1110xxxx 10xxxxxx 10xxxxxx 3-byte
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte
//
// Width in UTF8
// Decimal Width
// 0-127 1 byte
// 194-223 2 bytes
// 224-239 3 bytes
// 240-244 4 bytes
//
// Subsequent chars are in the range 128-191
var onlySawAsciiRange = true;
uint pos = 0;
while (pos < size)
{
byte ch = buffer[pos++];
if (ch == 0 && _nullSuggestsBinary)
{
return CharacterEncoding.None;
}
int moreChars;
if (ch <= 127)
{
// 1 byte
moreChars = 0;
}
else if (ch >= 194 && ch <= 223)
{
// 2 Byte
moreChars = 1;
}
else if (ch >= 224 && ch <= 239)
{
// 3 Byte
moreChars = 2;
}
else if (ch >= 240 && ch <= 244)
{
// 4 Byte
moreChars = 3;
}
else
{
return CharacterEncoding.None; // Not utf8
}
// Check secondary chars are in range if we are expecting any
while (moreChars > 0 && pos < size)
{
onlySawAsciiRange = false; // Seen non-ascii chars now
ch = buffer[pos++];
if (ch < 128 || ch > 191)
{
return CharacterEncoding.None; // Not utf8
}
--moreChars;
}
}
// If we get to here then only valid UTF-8 sequences have been processed
// If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
}
}
}

View File

@ -0,0 +1,125 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System.IO;
namespace UniversalDetector
{
/// <summary>
/// Default implementation of charset detection interface.
/// The detector can be fed by a System.IO.Stream:
/// <example>
/// <code>
/// using (FileStream fs = File.OpenRead(filename)) {
/// CharsetDetector cdet = new CharsetDetector();
/// cdet.Feed(fs);
/// cdet.DataEnd();
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// </code>
/// </example>
///
/// or by a byte a array:
///
/// <example>
/// <code>
/// byte[] buff = new byte[1024];
/// int read;
/// while ((read = stream.Read(buff, 0, buff.Length)) > 0 && !done)
/// Feed(buff, 0, read);
/// cdet.DataEnd();
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// </code>
/// </example>
/// </summary>
public class CharsetDetector : Core.UniversalDetector, ICharsetDetector
{
private string charset;
private float confidence;
//public event DetectorFinished Finished;
public CharsetDetector() : base(FILTER_ALL)
{
}
public void Feed(Stream stream)
{
byte[] buff = new byte[1024];
int read;
while ((read = stream.Read(buff, 0, buff.Length)) > 0 && !done)
{
Feed(buff, 0, read);
}
}
public bool IsDone()
{
return done;
}
public override void Reset()
{
this.charset = null;
this.confidence = 0.0f;
base.Reset();
}
public string Charset {
get { return charset; }
}
public float Confidence {
get { return confidence; }
}
protected override void Report(string charset, float confidence)
{
this.charset = charset;
this.confidence = confidence;
// if (Finished != null) {
// Finished(charset, confidence);
// }
}
}
//public delegate void DetectorFinished(string charset, float confidence);
}

View File

@ -0,0 +1,106 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class Big5Prober : CharsetProber
{
//void GetDistribution(PRUint32 aCharLen, const char* aStr);
private CodingStateMachine codingSM;
private BIG5DistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public Big5Prober()
{
this.codingSM = new CodingStateMachine(new BIG5SMModel());
this.distributionAnalyser = new BIG5DistributionAnalyser();
this.Reset();
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState = 0;
int max = offset + len;
for (int i = offset; i < max; i++) {
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) {
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START) {
int charLen = codingSM.CurrentCharLen;
if (i == offset) {
lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else {
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
}
}
}
lastChar[0] = buf[max-1];
if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
distributionAnalyser.Reset();
}
public override string GetCharsetName()
{
return "Big-5";
}
public override float GetConfidence()
{
return distributionAnalyser.GetConfidence();
}
}
}

View File

@ -0,0 +1,98 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA <k-tak@void.in> (Java port)
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class BitPackage
{
public static int INDEX_SHIFT_4BITS = 3;
public static int INDEX_SHIFT_8BITS = 2;
public static int INDEX_SHIFT_16BITS = 1;
public static int SHIFT_MASK_4BITS = 7;
public static int SHIFT_MASK_8BITS = 3;
public static int SHIFT_MASK_16BITS = 1;
public static int BIT_SHIFT_4BITS = 2;
public static int BIT_SHIFT_8BITS = 3;
public static int BIT_SHIFT_16BITS = 4;
public static int UNIT_MASK_4BITS = 0x0000000F;
public static int UNIT_MASK_8BITS = 0x000000FF;
public static int UNIT_MASK_16BITS = 0x0000FFFF;
private int indexShift;
private int shiftMask;
private int bitShift;
private int unitMask;
private int[] data;
public BitPackage(int indexShift, int shiftMask,
int bitShift, int unitMask, int[] data)
{
this.indexShift = indexShift;
this.shiftMask = shiftMask;
this.bitShift = bitShift;
this.unitMask = unitMask;
this.data = data;
}
public static int Pack16bits(int a, int b)
{
return ((b << 16) | a);
}
public static int Pack8bits(int a, int b, int c, int d)
{
return Pack16bits((b << 8) | a, (d << 8) | c);
}
public static int Pack4bits(int a, int b, int c, int d,
int e, int f, int g, int h)
{
return Pack8bits((b << 4) | a, (d << 4) | c,
(f << 4) | e, (h << 4) | g);
}
public int Unpack(int i)
{
return (data[i >> indexShift] >>
((i & shiftMask) << bitShift)) & unitMask;
}
}
}

View File

@ -0,0 +1,191 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System.IO;
namespace UniversalDetector.Core
{
public enum ProbingState {
Detecting = 0, // no sure answer yet, but caller can ask for confidence
FoundIt = 1, // positive answer
NotMe = 2 // negative answer
};
public abstract class CharsetProber
{
protected const float SHORTCUT_THRESHOLD = 0.95F;
protected ProbingState state;
// ASCII codes
private const byte SPACE = 0x20;
private const byte CAPITAL_A = 0x41;
private const byte CAPITAL_Z = 0x5A;
private const byte SMALL_A = 0x61;
private const byte SMALL_Z = 0x7A;
private const byte LESS_THAN = 0x3C;
private const byte GREATER_THAN = 0x3E;
/// <summary>
/// Feed data to the prober
/// </summary>
/// <param name="buf">a buffer</param>
/// <param name="offset">offset into buffer</param>
/// <param name="len">number of bytes available into buffer</param>
/// <returns>
/// A <see cref="ProbingState"/>
/// </returns>
public abstract ProbingState HandleData(byte[] buf, int offset, int len);
/// <summary>
/// Reset prober state
/// </summary>
public abstract void Reset();
public abstract string GetCharsetName();
public abstract float GetConfidence();
public virtual ProbingState GetState()
{
return state;
}
public virtual void SetOption()
{
}
public virtual void DumpStatus()
{
}
//
// Helper functions used in the Latin1 and Group probers
//
/// <summary>
///
/// </summary>
/// <returns>filtered buffer</returns>
protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int len)
{
byte[] result = null;
using (MemoryStream ms = new MemoryStream(buf.Length)) {
bool meetMSB = false;
int max = offset + len;
int prev = offset;
int cur = offset;
while (cur < max) {
byte b = buf[cur];
if ((b & 0x80) != 0) {
meetMSB = true;
} else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A)
|| b > SMALL_Z) {
if (meetMSB && cur > prev) {
ms.Write(buf, prev, cur - prev);
ms.WriteByte(SPACE);
meetMSB = false;
}
prev = cur + 1;
}
cur++;
}
if (meetMSB && cur > prev)
ms.Write(buf, prev, cur - prev);
ms.SetLength(ms.Position);
result = ms.ToArray();
}
return result;
}
/// <summary>
/// Do filtering to reduce load to probers (Remove ASCII symbols,
/// collapse spaces). This filter applies to all scripts which contain
/// both English characters and upper ASCII characters.
/// </summary>
/// <returns>a filtered copy of the input buffer</returns>
protected static byte[] FilterWithEnglishLetters(byte[] buf, int offset, int len)
{
byte[] result = null;
using (MemoryStream ms = new MemoryStream(buf.Length)) {
bool inTag = false;
int max = offset + len;
int prev = offset;
int cur = offset;
while (cur < max) {
byte b = buf[cur];
if (b == GREATER_THAN)
inTag = false;
else if (b == LESS_THAN)
inTag = true;
// it's ascii, but it's not a letter
if ((b & 0x80) == 0 && (b < CAPITAL_A || b > SMALL_Z
|| (b > CAPITAL_Z && b < SMALL_A))) {
if (cur > prev && !inTag) {
ms.Write(buf, prev, cur - prev);
ms.WriteByte(SPACE);
}
prev = cur + 1;
}
cur++;
}
// If the current segment contains more than just a symbol
// and it is not inside a tag then keep it.
if (!inTag && cur > prev)
ms.Write(buf, prev, cur - prev);
ms.SetLength(ms.Position);
result = ms.ToArray();
}
return result;
}
}
}

View File

@ -0,0 +1,149 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public static class Charsets
{
public const string ASCII = "ASCII";
public const string UTF8 = "UTF-8";
public const string UTF16_LE = "UTF-16LE";
public const string UTF16_BE = "UTF-16BE";
public const string UTF32_BE = "UTF-32BE";
public const string UTF32_LE = "UTF-32LE";
/// <summary>
/// Unusual BOM (3412 order)
/// </summary>
public const string UCS4_3412 = "X-ISO-10646-UCS-4-3412";
/// <summary>
/// Unusual BOM (2413 order)
/// </summary>
public const string UCS4_2413 = "X-ISO-10646-UCS-4-2413";
/// <summary>
/// Cyrillic (based on bulgarian and russian data)
/// </summary>
public const string WIN1251 = "windows-1251";
/// <summary>
/// Latin-1, almost identical to ISO-8859-1
/// </summary>
public const string WIN1252 = "windows-1252";
/// <summary>
/// Greek
/// </summary>
public const string WIN1253 = "windows-1253";
/// <summary>
/// Logical hebrew (includes ISO-8859-8-I and most of x-mac-hebrew)
/// </summary>
public const string WIN1255 = "windows-1255";
/// <summary>
/// Traditional chinese
/// </summary>
public const string BIG5 = "Big-5";
public const string EUCKR = "EUC-KR";
public const string EUCJP = "EUC-JP";
public const string EUCTW = "EUC-TW";
/// <summary>
/// Note: gb2312 is a subset of gb18030
/// </summary>
public const string GB18030 = "gb18030";
public const string ISO2022_JP = "ISO-2022-JP";
public const string ISO2022_CN = "ISO-2022-CN";
public const string ISO2022_KR = "ISO-2022-KR";
/// <summary>
/// Simplified chinese
/// </summary>
public const string HZ_GB_2312 = "HZ-GB-2312";
public const string SHIFT_JIS = "Shift-JIS";
public const string MAC_CYRILLIC = "x-mac-cyrillic";
public const string KOI8R = "KOI8-R";
public const string IBM855 = "IBM855";
public const string IBM866 = "IBM866";
/// <summary>
/// East-Europe. Disabled because too similar to windows-1252
/// (latin-1). Should use tri-grams models to discriminate between
/// these two charsets.
/// </summary>
public const string ISO8859_2 = "ISO-8859-2";
/// <summary>
/// Cyrillic
/// </summary>
public const string ISO8859_5 = "ISO-8859-5";
/// <summary>
/// Greek
/// </summary>
public const string ISO_8859_7 = "ISO-8859-7";
/// <summary>
/// Visual Hebrew
/// </summary>
public const string ISO8859_8 = "ISO-8859-8";
/// <summary>
/// Thai. This recognizer is not enabled yet.
/// </summary>
public const string TIS620 = "TIS620";
}
}

View File

@ -0,0 +1,90 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Kohei TAKETA <k-tak@void.in> (Java port)
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
/// <summary>
/// Parallel state machine for the Coding Scheme Method
/// </summary>
public class CodingStateMachine
{
private int currentState;
private SMModel model;
private int currentCharLen;
private int currentBytePos;
public CodingStateMachine(SMModel model)
{
this.currentState = SMModel.START;
this.model = model;
}
public int NextState(byte b)
{
// for each byte we get its class, if it is first byte,
// we also get byte length
int byteCls = model.GetClass(b);
if (currentState == SMModel.START) {
currentBytePos = 0;
currentCharLen = model.charLenTable[byteCls];
}
// from byte's class and stateTable, we get its next state
currentState = model.stateTable.Unpack(
currentState * model.ClassFactor + byteCls);
currentBytePos++;
return currentState;
}
public void Reset()
{
currentState = SMModel.START;
}
public int CurrentCharLen
{
get { return currentCharLen; }
}
public string ModelName
{
get { return model.Name; }
}
}
}

View File

@ -0,0 +1,110 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EUCJPProber : CharsetProber
{
private CodingStateMachine codingSM;
private EUCJPContextAnalyser contextAnalyser;
private EUCJPDistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public EUCJPProber()
{
codingSM = new CodingStateMachine(new EUCJPSMModel());
distributionAnalyser = new EUCJPDistributionAnalyser();
contextAnalyser = new EUCJPContextAnalyser();
Reset();
}
public override string GetCharsetName()
{
return "EUC-JP";
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState;
int max = offset + len;
for (int i = offset; i < max; i++) {
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) {
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START) {
int charLen = codingSM.CurrentCharLen;
if (i == offset) {
lastChar[1] = buf[offset];
contextAnalyser.HandleOneChar(lastChar, 0, charLen);
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else {
contextAnalyser.HandleOneChar(buf, i-1, charLen);
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
}
}
}
lastChar[0] = buf[max-1];
if (state == ProbingState.Detecting)
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
contextAnalyser.Reset();
distributionAnalyser.Reset();
}
public override float GetConfidence()
{
float contxtCf = contextAnalyser.GetConfidence();
float distribCf = distributionAnalyser.GetConfidence();
return (contxtCf > distribCf ? contxtCf : distribCf);
}
}
}

View File

@ -0,0 +1,107 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EUCKRProber : CharsetProber
{
private CodingStateMachine codingSM;
private EUCKRDistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public EUCKRProber()
{
codingSM = new CodingStateMachine(new EUCKRSMModel());
distributionAnalyser = new EUCKRDistributionAnalyser();
Reset();
}
public override string GetCharsetName()
{
return "EUC-KR";
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState;
int max = offset + len;
for (int i = offset; i < max; i++) {
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) {
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START) {
int charLen = codingSM.CurrentCharLen;
if (i == offset) {
lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else {
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
}
}
}
lastChar[0] = buf[max-1];
if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override float GetConfidence()
{
return distributionAnalyser.GetConfidence();
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
distributionAnalyser.Reset();
//mContextAnalyser.Reset();
}
}
}

View File

@ -0,0 +1,106 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EUCTWProber : CharsetProber
{
private CodingStateMachine codingSM;
private EUCTWDistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public EUCTWProber()
{
this.codingSM = new CodingStateMachine(new EUCTWSMModel());
this.distributionAnalyser = new EUCTWDistributionAnalyser();
this.Reset();
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState;
int max = offset + len;
for (int i = 0; i < max; i++) {
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) {
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START) {
int charLen = codingSM.CurrentCharLen;
if (i == offset) {
lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else {
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
}
}
}
lastChar[0] = buf[max-1];
if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override string GetCharsetName()
{
return "x-euc-tw";
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
distributionAnalyser.Reset();
}
public override float GetConfidence()
{
return distributionAnalyser.GetConfidence();
}
}
}

View File

@ -0,0 +1,105 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EscCharsetProber : CharsetProber
{
private const int CHARSETS_NUM = 4;
private string detectedCharset;
private CodingStateMachine[] codingSM;
int activeSM;
public EscCharsetProber()
{
codingSM = new CodingStateMachine[CHARSETS_NUM];
codingSM[0] = new CodingStateMachine(new HZSMModel());
codingSM[1] = new CodingStateMachine(new ISO2022CNSMModel());
codingSM[2] = new CodingStateMachine(new ISO2022JPSMModel());
codingSM[3] = new CodingStateMachine(new ISO2022KRSMModel());
Reset();
}
public override void Reset()
{
state = ProbingState.Detecting;
for (int i = 0; i < CHARSETS_NUM; i++)
codingSM[i].Reset();
activeSM = CHARSETS_NUM;
detectedCharset = null;
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int max = offset + len;
for (int i = offset; i < max && state == ProbingState.Detecting; i++) {
for (int j = activeSM - 1; j >= 0; j--) {
// byte is feed to all active state machine
int codingState = codingSM[j].NextState(buf[i]);
if (codingState == SMModel.ERROR) {
// got negative answer for this state machine, make it inactive
activeSM--;
if (activeSM == 0) {
state = ProbingState.NotMe;
return state;
} else if (j != activeSM) {
CodingStateMachine t = codingSM[activeSM];
codingSM[activeSM] = codingSM[j];
codingSM[j] = t;
}
} else if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
detectedCharset = codingSM[j].ModelName;
return state;
}
}
}
return state;
}
public override string GetCharsetName()
{
return detectedCharset;
}
public override float GetConfidence()
{
return 0.99f;
}
}
}

View File

@ -0,0 +1,304 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA <k-tak@void.in> (Java port)
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/// <summary>
/// Escaped charsets state machines
/// </summary>
namespace UniversalDetector.Core
{
public class HZSMModel : SMModel
{
private readonly static int[] HZ_cls = {
BitPackage.Pack4bits(1,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,4,0,5,2,0), // 78 - 7f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 80 - 87
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 88 - 8f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 90 - 97
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 98 - 9f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a0 - a7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a8 - af
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c0 - c7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c8 - cf
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d0 - d7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d8 - df
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e0 - e7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e8 - ef
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // f0 - f7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1) // f8 - ff
};
private readonly static int[] HZ_st = {
BitPackage.Pack4bits(START, ERROR, 3, START, START, START, ERROR, ERROR),//00-07
BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME),//08-0f
BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, START, START, 4, ERROR),//10-17
BitPackage.Pack4bits( 5, ERROR, 6, ERROR, 5, 5, 4, ERROR),//18-1f
BitPackage.Pack4bits( 4, ERROR, 4, 4, 4, ERROR, 4, ERROR),//20-27
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
};
private readonly static int[] HZCharLenTable = {0, 0, 0, 0, 0, 0};
public HZSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, HZ_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, HZ_st),
HZCharLenTable, "HZ-GB-2312")
{
}
}
public class ISO2022CNSMModel : SMModel
{
private readonly static int[] ISO2022CN_cls = {
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,3,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,4,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
};
private readonly static int[] ISO2022CN_st = {
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,START,START),//00-07
BitPackage.Pack4bits(START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//10-17
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR),//18-1f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//20-27
BitPackage.Pack4bits( 5, 6,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//28-2f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//30-37
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
};
private readonly static int[] ISO2022CNCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0};
public ISO2022CNSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022CN_cls),
9,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022CN_st),
ISO2022CNCharLenTable, "ISO-2022-CN")
{
}
}
public class ISO2022JPSMModel : SMModel
{
private readonly static int[] ISO2022JP_cls = {
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,2,2), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,7,0,0,0), // 20 - 27
BitPackage.Pack4bits(3,0,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(6,0,4,0,8,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,9,5,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
};
private readonly static int[] ISO2022JP_st = {
BitPackage.Pack4bits(START, 3, ERROR,START,START,START,START,START),//00-07
BitPackage.Pack4bits(START, START, ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//10-17
BitPackage.Pack4bits(ITSME, ITSME, ITSME,ITSME,ITSME,ITSME,ERROR,ERROR),//18-1f
BitPackage.Pack4bits(ERROR, 5, ERROR,ERROR,ERROR, 4,ERROR,ERROR),//20-27
BitPackage.Pack4bits(ERROR, ERROR, ERROR, 6,ITSME,ERROR,ITSME,ERROR),//28-2f
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//30-37
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//38-3f
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
};
private readonly static int[] ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
public ISO2022JPSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022JP_cls),
10,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022JP_st),
ISO2022JPCharLenTable, "ISO-2022-JP")
{
}
}
public class ISO2022KRSMModel : SMModel
{
private readonly static int[] ISO2022KR_cls = {
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,3,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,4,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,5,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
};
private readonly static int[] ISO2022KR_st = {
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR,ERROR),//10-17
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 5,ERROR,ERROR,ERROR),//18-1f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
};
private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0};
public ISO2022KRSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022KR_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022KR_st),
ISO2022KRCharLenTable, "ISO-2022-KR")
{
}
}
}

View File

@ -0,0 +1,111 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
// We use gb18030 to replace gb2312, because 18030 is a superset.
public class GB18030Prober : CharsetProber
{
private CodingStateMachine codingSM;
private GB18030DistributionAnalyser analyser;
private byte[] lastChar;
public GB18030Prober()
{
lastChar = new byte[2];
codingSM = new CodingStateMachine(new GB18030SMModel());
analyser = new GB18030DistributionAnalyser();
Reset();
}
public override string GetCharsetName()
{
return "gb18030";
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState = SMModel.START;
int max = offset + len;
for (int i = offset; i < max; i++) {
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) {
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START) {
int charLen = codingSM.CurrentCharLen;
if (i == offset) {
lastChar[1] = buf[offset];
analyser.HandleOneChar(lastChar, 0, charLen);
} else {
analyser.HandleOneChar(buf, i-1, charLen);
}
}
}
lastChar[0] = buf[max-1];
if (state == ProbingState.Detecting) {
if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
}
return state;
}
public override float GetConfidence()
{
return analyser.GetConfidence();
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
analyser.Reset();
}
}
}

View File

@ -0,0 +1,324 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System;
/**
* General ideas of the Hebrew charset recognition
*
* Four main charsets exist in Hebrew:
* "ISO-8859-8" - Visual Hebrew
* "windows-1255" - Logical Hebrew
* "ISO-8859-8-I" - Logical Hebrew
* "x-mac-hebrew" - ?? Logical Hebrew ??
*
* Both "ISO" charsets use a completely identical set of code points, whereas
* "windows-1255" and "x-mac-hebrew" are two different proper supersets of
* these code points. windows-1255 defines additional characters in the range
* 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
* diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
* x-mac-hebrew defines similar additional code points but with a different
* mapping.
*
* As far as an average Hebrew text with no diacritics is concerned, all four
* charsets are identical with respect to code points. Meaning that for the
* main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
* (including final letters).
*
* The dominant difference between these charsets is their directionality.
* "Visual" directionality means that the text is ordered as if the renderer is
* not aware of a BIDI rendering algorithm. The renderer sees the text and
* draws it from left to right. The text itself when ordered naturally is read
* backwards. A buffer of Visual Hebrew generally looks like so:
* "[last word of first line spelled backwards] [whole line ordered backwards
* and spelled backwards] [first word of first line spelled backwards]
* [end of line] [last word of second line] ... etc' "
* adding punctuation marks, numbers and English text to visual text is
* naturally also "visual" and from left to right.
*
* "Logical" directionality means the text is ordered "naturally" according to
* the order it is read. It is the responsibility of the renderer to display
* the text from right to left. A BIDI algorithm is used to place general
* punctuation marks, numbers and English text in the text.
*
* Texts in x-mac-hebrew are almost impossible to find on the Internet. From
* what little evidence I could find, it seems that its general directionality
* is Logical.
*
* To sum up all of the above, the Hebrew probing mechanism knows about two
* charsets:
* Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
* backwards while line order is natural. For charset recognition purposes
* the line order is unimportant (In fact, for this implementation, even
* word order is unimportant).
* Logical Hebrew - "windows-1255" - normal, naturally ordered text.
*
* "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
* specifically identified.
* "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
* that contain special punctuation marks or diacritics is displayed with
* some unconverted characters showing as question marks. This problem might
* be corrected using another model prober for x-mac-hebrew. Due to the fact
* that x-mac-hebrew texts are so rare, writing another model prober isn't
* worth the effort and performance hit.
*
* *** The Prober ***
*
* The prober is divided between two nsSBCharSetProbers and an nsHebrewProber,
* all of which are managed, created, fed data, inquired and deleted by the
* nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in
* fact some kind of Hebrew, Logical or Visual. The final decision about which
* one is it is made by the nsHebrewProber by combining final-letter scores
* with the scores of the two nsSBCharSetProbers to produce a final answer.
*
* The nsSBCSGroupProber is responsible for stripping the original text of HTML
* tags, English characters, numbers, low-ASCII punctuation characters, spaces
* and new lines. It reduces any sequence of such characters to a single space.
* The buffer fed to each prober in the SBCS group prober is pure text in
* high-ASCII.
* The two nsSBCharSetProbers (model probers) share the same language model:
* Win1255Model.
* The first nsSBCharSetProber uses the model normally as any other
* nsSBCharSetProber does, to recognize windows-1255, upon which this model was
* built. The second nsSBCharSetProber is told to make the pair-of-letter
* lookup in the language model backwards. This in practice exactly simulates
* a visual Hebrew model using the windows-1255 logical Hebrew model.
*
* The nsHebrewProber is not using any language model. All it does is look for
* final-letter evidence suggesting the text is either logical Hebrew or visual
* Hebrew. Disjointed from the model probers, the results of the nsHebrewProber
* alone are meaningless. nsHebrewProber always returns 0.00 as confidence
* since it never identifies a charset by itself. Instead, the pointer to the
* nsHebrewProber is passed to the model probers as a helper "Name Prober".
* When the Group prober receives a positive identification from any prober,
* it asks for the name of the charset identified. If the prober queried is a
* Hebrew model prober, the model prober forwards the call to the
* nsHebrewProber to make the final decision. In the nsHebrewProber, the
* decision is made according to the final-letters scores maintained and Both
* model probers scores. The answer is returned in the form of the name of the
* charset identified, either "windows-1255" or "ISO-8859-8".
*
*/
namespace UniversalDetector.Core
{
/// <summary>
/// This prober doesn't actually recognize a language or a charset.
/// It is a helper prober for the use of the Hebrew model probers
/// </summary>
public class HebrewProber : CharsetProber
{
// windows-1255 / ISO-8859-8 code points of interest
private const byte FINAL_KAF = 0xEA;
private const byte NORMAL_KAF = 0xEB;
private const byte FINAL_MEM = 0xED;
private const byte NORMAL_MEM = 0xEE;
private const byte FINAL_NUN = 0xEF;
private const byte NORMAL_NUN = 0xF0;
private const byte FINAL_PE = 0xF3;
private const byte NORMAL_PE = 0xF4;
private const byte FINAL_TSADI = 0xF5;
private const byte NORMAL_TSADI = 0xF6;
// Minimum Visual vs Logical final letter score difference.
// If the difference is below this, don't rely solely on the final letter score distance.
private const int MIN_FINAL_CHAR_DISTANCE = 5;
// Minimum Visual vs Logical model score difference.
// If the difference is below this, don't rely at all on the model score distance.
private const float MIN_MODEL_DISTANCE = 0.01f;
protected const string VISUAL_HEBREW_NAME = "ISO-8859-8";
protected const string LOGICAL_HEBREW_NAME = "windows-1255";
// owned by the group prober.
protected CharsetProber logicalProber, visualProber;
protected int finalCharLogicalScore, finalCharVisualScore;
// The two last bytes seen in the previous buffer.
protected byte prev, beforePrev;
public HebrewProber()
{
Reset();
}
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
{
logicalProber = logical;
visualProber = visual;
}
/**
* Final letter analysis for logical-visual decision.
* Look for evidence that the received buffer is either logical Hebrew or
* visual Hebrew.
* The following cases are checked:
* 1) A word longer than 1 letter, ending with a final letter. This is an
* indication that the text is laid out "naturally" since the final letter
* really appears at the end. +1 for logical score.
* 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
* Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
* the Non-Final form of that letter. Exceptions to this rule are mentioned
* above in isNonFinal(). This is an indication that the text is laid out
* backwards. +1 for visual score
* 3) A word longer than 1 letter, starting with a final letter. Final letters
* should not appear at the beginning of a word. This is an indication that
* the text is laid out backwards. +1 for visual score.
*
* The visual score and logical score are accumulated throughout the text and
* are finally checked against each other in GetCharSetName().
* No checking for final letters in the middle of words is done since that case
* is not an indication for either Logical or Visual text.
*
* The input buffer should not contain any white spaces that are not (' ')
* or any low-ascii punctuation marks.
*/
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
// Both model probers say it's not them. No reason to continue.
if (GetState() == ProbingState.NotMe)
return ProbingState.NotMe;
int max = offset + len;
for (int i = offset; i < max; i++) {
byte b = buf[i];
// a word just ended
if (b == 0x20) {
// *(curPtr-2) was not a space so prev is not a 1 letter word
if (beforePrev != 0x20) {
// case (1) [-2:not space][-1:final letter][cur:space]
if (IsFinal(prev))
finalCharLogicalScore++;
// case (2) [-2:not space][-1:Non-Final letter][cur:space]
else if (IsNonFinal(prev))
finalCharVisualScore++;
}
} else {
// case (3) [-2:space][-1:final letter][cur:not space]
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
++finalCharVisualScore;
}
beforePrev = prev;
prev = b;
}
// Forever detecting, till the end or until both model probers
// return NotMe (handled above).
return ProbingState.Detecting;
}
// Make the decision: is it Logical or Visual?
public override string GetCharsetName()
{
// If the final letter score distance is dominant enough, rely on it.
int finalsub = finalCharLogicalScore - finalCharVisualScore;
if (finalsub >= MIN_FINAL_CHAR_DISTANCE)
return LOGICAL_HEBREW_NAME;
if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
return VISUAL_HEBREW_NAME;
// It's not dominant enough, try to rely on the model scores instead.
float modelsub = logicalProber.GetConfidence() - visualProber.GetConfidence();
if (modelsub > MIN_MODEL_DISTANCE)
return LOGICAL_HEBREW_NAME;
if (modelsub < -(MIN_MODEL_DISTANCE))
return VISUAL_HEBREW_NAME;
// Still no good, back to final letter distance, maybe it'll save the day.
if (finalsub < 0)
return VISUAL_HEBREW_NAME;
// (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
return LOGICAL_HEBREW_NAME;
}
public override void Reset()
{
finalCharLogicalScore = 0;
finalCharVisualScore = 0;
prev = 0x20;
beforePrev = 0x20;
}
public override ProbingState GetState()
{
// Remain active as long as any of the model probers are active.
if (logicalProber.GetState() == ProbingState.NotMe &&
visualProber.GetState() == ProbingState.NotMe)
return ProbingState.NotMe;
return ProbingState.Detecting;
}
public override void DumpStatus()
{
//Console.WriteLine(" HEB: {0} - {1} [Logical-Visual score]", finalCharLogicalScore, finalCharVisualScore);
}
public override float GetConfidence()
{
return 0.0f;
}
protected static bool IsFinal(byte b)
{
return (b == FINAL_KAF || b == FINAL_MEM || b == FINAL_NUN
|| b == FINAL_PE || b == FINAL_TSADI);
}
protected static bool IsNonFinal(byte b)
{
// The normal Tsadi is not a good Non-Final letter due to words like
// 'lechotet' (to chat) containing an apostrophe after the tsadi. This
// apostrophe is converted to a space in FilterWithoutEnglishLetters causing
// the Non-Final tsadi to appear at an end of a word even though this is not
// the case in the original text.
// The letters Pe and Kaf rarely display a related behavior of not being a
// good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
// example legally end with a Non-Final Pe or Kaf. However, the benefit of
// these letters as Non-Final letters outweighs the damage since these words
// are quite rare.
return (b == NORMAL_KAF || b == NORMAL_MEM || b == NORMAL_NUN
|| b == NORMAL_PE);
}
}
}

View File

@ -0,0 +1,315 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class JapaneseContextAnalyser
{
protected const int CATEGORIES_NUM = 6;
protected const int ENOUGH_REL_THRESHOLD = 100;
protected const int MAX_REL_THRESHOLD = 1000;
protected const int MINIMUM_DATA_THRESHOLD = 4;
protected const float DONT_KNOW = -1.0f;
// hiragana frequency category table
// This is hiragana 2-char sequence table, the number in each cell represents its frequency category
protected static byte[,] jp2CharContext = {
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
{ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,},
{ 0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4,},
{ 1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4,},
{ 0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3,},
{ 0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3,},
{ 0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3,},
{ 0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4,},
{ 0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3,},
{ 2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4,},
{ 0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3,},
{ 0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5,},
{ 0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3,},
{ 2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5,},
{ 0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4,},
{ 1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4,},
{ 0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3,},
{ 0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3,},
{ 0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3,},
{ 0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5,},
{ 0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4,},
{ 0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5,},
{ 0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3,},
{ 0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4,},
{ 0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4,},
{ 0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4,},
{ 0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,},
{ 0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,},
{ 1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3,},
{ 0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0,},
{ 0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3,},
{ 0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3,},
{ 0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5,},
{ 0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4,},
{ 2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5,},
{ 0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3,},
{ 0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3,},
{ 0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3,},
{ 0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3,},
{ 0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4,},
{ 0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4,},
{ 0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2,},
{ 0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3,},
{ 0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3,},
{ 0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3,},
{ 0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3,},
{ 0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4,},
{ 0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3,},
{ 0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4,},
{ 0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3,},
{ 0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3,},
{ 0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4,},
{ 0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4,},
{ 0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3,},
{ 2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4,},
{ 0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4,},
{ 0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3,},
{ 0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4,},
{ 0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4,},
{ 1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4,},
{ 0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3,},
{ 0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,},
{ 0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2,},
{ 0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3,},
{ 0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3,},
{ 0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5,},
{ 0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3,},
{ 0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4,},
{ 1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4,},
{ 0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1,},
{ 0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2,},
{ 0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3,},
{ 0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1,},
};
// category counters, each integer counts sequence in its category
int[] relSample = new int[CATEGORIES_NUM];
// total sequence received
int totalRel;
// The order of previous char
int lastCharOrder;
// if last byte in current buffer is not the last byte of a character,
// we need to know how many byte to skip in next buffer.
int needToSkipCharNum;
// If this flag is set to true, detection is done and conclusion has
// been made
bool done;
public JapaneseContextAnalyser()
{
Reset();
}
public float GetConfidence()
{
// This is just one way to calculate confidence. It works well for me.
if (totalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(totalRel - relSample[0]))/totalRel;
else
return DONT_KNOW;
}
public void HandleData(byte[] buf, int offset, int len)
{
int charLen = 0;
int max = offset + len;
if (done)
return;
// The buffer we got is byte oriented, and a character may span
// more than one buffer. In case the last one or two byte in last
// buffer is not complete, we record how many byte needed to
// complete that character and skip these bytes here. We can choose
// to record those bytes as well and analyse the character once it
// is complete, but since a character will not make much difference,
// skipping it will simplify our logic and improve performance.
for (int i = needToSkipCharNum+offset; i < max; ) {
int order = GetOrder(buf, i, out charLen);
i += charLen;
if (i > max) {
needToSkipCharNum = i - max;
lastCharOrder = -1;
} else {
if (order != -1 && lastCharOrder != -1) {
totalRel ++;
if (totalRel > MAX_REL_THRESHOLD) {
done = true;
break;
}
relSample[jp2CharContext[lastCharOrder, order]]++;
}
lastCharOrder = order;
}
}
}
public void HandleOneChar(byte[] buf, int offset, int charLen)
{
if (totalRel > MAX_REL_THRESHOLD)
done = true;
if (done)
return;
// Only 2-bytes characters are of our interest
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
if (order != -1 && lastCharOrder != -1) {
totalRel++;
// count this sequence to its category counter
relSample[jp2CharContext[lastCharOrder, order]]++;
}
lastCharOrder = order;
}
public void Reset()
{
totalRel = 0;
for (int i = 0; i < CATEGORIES_NUM; i++) {
relSample[i] = 0;
needToSkipCharNum = 0;
lastCharOrder = -1;
done = false;
}
}
protected abstract int GetOrder(byte[] buf, int offset, out int charLen);
protected abstract int GetOrder(byte[] buf, int offset);
public bool GotEnoughData()
{
return totalRel > ENOUGH_REL_THRESHOLD;
}
}
public class SJISContextAnalyser : JapaneseContextAnalyser
{
private const byte HIRAGANA_FIRST_BYTE = 0x82;
protected override int GetOrder(byte[] buf, int offset, out int charLen)
{
//find out current char's byte length
if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
|| buf[offset] >= 0xe0 && buf[offset] <= 0xFC)
charLen = 2;
else
charLen = 1;
// return its order if it is hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
byte low = buf[offset+1];
if (low >= 0x9F && low <= 0xF1)
return low - 0x9F;
}
return -1;
}
protected override int GetOrder(byte[] buf, int offset)
{
// We are only interested in Hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
byte low = buf[offset+1];
if (low >= 0x9F && low <= 0xF1)
return low - 0x9F;
}
return -1;
}
}
public class EUCJPContextAnalyser : JapaneseContextAnalyser
{
private const byte HIRAGANA_FIRST_BYTE = 0xA4;
protected override int GetOrder(byte[] buf, int offset, out int charLen)
{
byte high = buf[offset];
//find out current char's byte length
if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
charLen = 2;
else if (high == 0xBF)
charLen = 3;
else
charLen = 1;
// return its order if it is hiragana
if (high == HIRAGANA_FIRST_BYTE) {
byte low = buf[offset+1];
if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1;
}
return -1;
}
protected override int GetOrder(byte[] buf, int offset)
{
// We are only interested in Hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
byte low = buf[offset+1];
if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1;
}
return -1;
}
}
}

View File

@ -0,0 +1,246 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class BulgarianModel : SequenceModel
{
//Model Table:
//total sequences: 100%
//first 512 sequences: 96.9392%
//first 1024 sequences:3.0618%
//rest sequences: 0.2992%
//negative sequences: 0.0020%
private static byte[] BULGARIAN_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,0,3,1,0,
0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,1,3,3,3,3,2,2,2,1,1,2,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,2,3,2,2,3,3,1,1,2,3,3,2,3,3,3,3,2,1,2,0,2,0,3,0,0,
0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,1,3,3,3,3,3,2,3,2,3,3,3,3,3,2,3,3,1,3,0,3,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,1,3,3,2,3,2,2,2,0,0,2,0,2,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,3,3,1,2,2,3,2,1,1,2,0,2,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,2,3,3,1,2,3,2,2,2,3,3,3,3,3,2,2,3,1,2,0,2,1,2,0,0,
0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,1,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,1,2,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,3,3,1,1,1,2,2,1,3,1,3,2,2,3,0,0,1,0,1,0,1,0,0,
0,0,0,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,2,2,3,2,2,3,1,2,1,1,1,2,3,1,3,1,2,2,0,1,1,1,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,1,3,2,2,3,3,1,2,3,1,1,3,3,3,3,1,2,2,1,1,1,0,2,0,2,0,1,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,2,2,1,1,2,0,2,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,0,1,2,1,3,3,2,3,3,3,3,3,2,3,2,1,0,3,1,2,1,2,1,2,3,2,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,1,3,3,2,3,3,2,2,2,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,0,3,3,3,3,3,2,1,1,2,1,3,3,0,3,1,1,1,1,3,2,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,1,1,3,1,3,3,2,3,2,2,2,3,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,2,2,3,2,1,1,1,1,1,3,1,3,1,1,0,0,0,1,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,2,0,3,2,0,3,0,2,0,0,2,1,3,1,0,0,1,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,2,1,1,1,1,2,1,1,2,1,1,1,2,2,1,2,1,1,1,0,1,1,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,2,1,3,1,1,2,1,3,2,1,1,0,1,2,3,2,1,1,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,2,2,1,0,1,0,0,1,0,0,0,2,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,2,3,2,3,3,1,3,2,1,1,1,2,1,1,2,1,3,0,1,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,2,2,3,3,2,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,0,1,1,0,1,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,1,3,1,0,2,2,1,3,2,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,1,2,0,2,3,1,2,3,2,0,1,3,1,2,1,1,1,0,0,1,0,0,2,2,2,3,
2,2,2,2,1,2,1,1,2,2,1,1,2,0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,1,
3,3,3,3,3,2,1,2,2,1,2,0,2,0,1,0,1,2,1,2,1,1,0,0,0,1,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,2,3,3,1,1,3,1,0,3,2,1,0,0,0,1,2,0,2,0,1,0,0,0,1,0,1,2,1,2,2,
1,1,1,1,1,1,1,2,2,2,1,1,1,1,1,1,1,0,1,2,1,1,1,0,0,0,0,0,1,1,0,0,
3,1,0,1,0,2,3,2,2,2,3,2,2,2,2,2,1,0,2,1,2,1,1,1,0,1,2,1,2,2,2,1,
1,1,2,2,2,2,1,2,1,1,0,1,2,1,2,2,2,1,1,1,0,1,1,1,1,2,0,1,0,0,0,0,
2,3,2,3,3,0,0,2,1,0,2,1,0,0,0,0,2,3,0,2,0,0,0,0,0,1,0,0,2,0,1,2,
2,1,2,1,2,2,1,1,1,2,1,1,1,0,1,2,2,1,1,1,1,1,0,1,1,1,0,0,1,2,0,0,
3,3,2,2,3,0,2,3,1,1,2,0,0,0,1,0,0,2,0,2,0,0,0,1,0,1,0,1,2,0,2,2,
1,1,1,1,2,1,0,1,2,2,2,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,
2,3,2,3,3,0,0,3,0,1,1,0,1,0,0,0,2,2,1,2,0,0,0,0,0,0,0,0,2,0,1,2,
2,2,1,1,1,1,1,2,2,2,1,0,2,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
3,3,3,3,2,2,2,2,2,0,2,1,1,1,1,2,1,2,1,1,0,2,0,1,0,1,0,0,2,0,1,2,
1,1,1,1,1,1,1,2,2,1,1,0,2,0,1,0,2,0,0,1,1,1,0,0,2,0,0,0,1,1,0,0,
2,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,1,2,0,1,2,
2,2,2,1,1,2,1,1,2,2,2,1,2,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0,
2,3,3,3,3,0,2,2,0,2,1,0,0,0,1,1,1,2,0,2,0,0,0,3,0,0,0,0,2,0,2,2,
1,1,1,2,1,2,1,1,2,2,2,1,2,0,1,1,1,0,1,1,1,1,0,2,1,0,0,0,1,1,0,0,
2,3,3,3,3,0,2,1,0,0,2,0,0,0,0,0,1,2,0,2,0,0,0,0,0,0,0,0,2,0,1,2,
1,1,1,2,1,1,1,1,2,2,2,0,1,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0,
3,3,2,2,3,0,1,0,1,0,0,0,0,0,0,0,1,1,0,3,0,0,0,0,0,0,0,0,1,0,2,2,
1,1,1,1,1,2,1,1,2,2,1,2,2,1,0,1,1,1,1,1,0,1,0,0,1,0,0,0,1,1,0,0,
3,1,0,1,0,2,2,2,2,3,2,1,1,1,2,3,0,0,1,0,2,1,1,0,1,1,1,1,2,1,1,1,
1,2,2,1,2,1,2,2,1,1,0,1,2,1,2,2,1,1,1,0,0,1,1,1,2,1,0,1,0,0,0,0,
2,1,0,1,0,3,1,2,2,2,2,1,2,2,1,1,1,0,2,1,2,2,1,1,2,1,1,0,2,1,1,1,
1,2,2,2,2,2,2,2,1,2,0,1,1,0,2,1,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,0,
2,1,1,1,1,2,2,2,2,1,2,2,2,1,2,2,1,1,2,1,2,3,2,2,1,1,1,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,3,2,0,1,2,0,1,2,1,1,0,1,0,1,2,1,2,0,0,0,1,1,0,0,0,1,0,0,2,
1,1,0,0,1,1,0,1,1,1,1,0,2,0,1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,
2,0,0,0,0,1,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,2,1,1,1,
1,2,2,2,2,1,1,2,1,2,1,1,1,0,2,1,2,1,1,1,0,2,1,1,1,1,0,1,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,3,2,0,0,0,0,1,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,1,0,1,2,
1,1,1,1,1,1,0,0,2,2,2,2,2,0,1,1,0,1,1,1,1,1,0,0,1,0,0,0,1,1,0,1,
2,3,1,2,1,0,1,1,0,2,2,2,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,2,
1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
2,2,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,2,2,
1,1,1,1,1,0,0,1,2,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,0,2,0,1,1,0,0,0,1,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,1,
0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,3,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,2,
1,1,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
2,1,2,2,2,1,2,1,2,2,1,1,2,1,1,1,0,1,1,1,1,2,0,1,0,1,1,1,1,0,1,1,
1,1,2,1,1,1,1,1,1,0,0,1,2,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,
1,0,0,1,3,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,1,0,0,1,0,2,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0,1,
0,2,0,1,0,0,1,1,2,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,1,1,0,2,1,0,1,1,1,0,0,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1,
0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,0,0,1,2,1,1,1,1,1,1,2,2,1,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0,
1,1,2,1,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,1,2,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,2,0,0,2,0,1,0,0,1,0,0,1,
1,1,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
1,1,1,1,1,1,1,2,0,0,0,0,0,0,2,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
};
public BulgarianModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, BULGARIAN_LANG_MODEL, 0.969392f, false, name)
{
}
}
public class Latin5BulgarianModel : BulgarianModel
{
//255: Control characters that usually does not exist in any text
//254: Carriage/Return
//253: symbol (punctuation) that does not belong to word
//252: 0 - 9
// Character Mapping Table:
// this table is modified base on win1251BulgarianCharToOrderMap, so
// only number <64 is sure valid
private static byte[] LATIN5_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50
253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70
194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, //80
210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, //90
81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, //a0
31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //b0
39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0
1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //d0
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
};
public Latin5BulgarianModel() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
{
}
}
public class Win1251BulgarianModel : BulgarianModel
{
private static byte[] WIN1251__CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50
253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70
206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, //80
221, 78, 64, 83,121, 98,117,105,222,223,224,225,226,227,228,229, //90
88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, //a0
73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, //b0
31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //c0
39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,252, 60, 56, //d0
1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //e0
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, //f0
};
public Win1251BulgarianModel() : base(WIN1251__CHAR_TO_ORDER_MAP, "windows-1251")
{
}
}
}

View File

@ -0,0 +1,345 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class CyrillicModel : SequenceModel
{
// Model Table:
// total sequences: 100%
// first 512 sequences: 97.6601%
// first 1024 sequences: 2.3389%
// rest sequences: 0.1237%
// negative sequences: 0.0009%
protected readonly static byte[] RUSSIAN_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,2,3,3,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,2,2,2,3,1,3,3,1,3,3,3,3,2,2,3,0,2,2,2,3,3,2,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,3,3,3,2,2,3,2,3,3,3,2,1,2,2,0,1,2,2,2,2,2,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,0,2,2,3,3,2,1,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,1,2,3,2,2,3,2,3,3,3,3,2,2,3,0,3,2,2,3,1,1,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,3,3,3,3,2,2,2,0,3,3,3,2,2,2,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,2,3,2,2,0,1,3,2,1,2,2,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,2,1,1,3,0,1,1,1,1,2,1,1,0,2,2,2,1,2,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,2,2,2,2,1,3,2,3,2,3,2,1,2,2,0,1,1,2,1,2,1,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,2,2,2,2,0,2,2,2,2,3,1,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,2,3,2,2,3,3,3,3,3,3,3,3,3,1,3,2,0,0,3,3,3,3,2,3,3,3,3,2,3,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,3,2,2,3,3,0,2,1,0,3,2,3,2,3,0,0,1,2,0,0,1,0,1,2,1,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,3,0,2,3,3,3,3,2,3,3,3,3,1,2,2,0,0,2,3,2,2,2,3,2,3,2,2,3,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,0,2,3,2,3,0,1,2,3,3,2,0,2,3,0,0,2,3,2,2,0,1,3,1,3,2,2,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,3,0,2,3,3,3,3,3,3,3,3,2,1,3,2,0,0,2,2,3,3,3,2,3,3,0,2,2,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,3,2,2,2,3,3,0,0,1,1,1,1,1,2,0,0,1,1,1,1,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,0,3,2,3,3,2,3,2,0,2,1,0,1,1,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,3,2,2,2,2,3,1,3,2,3,1,1,2,1,0,2,2,2,2,1,3,1,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
2,2,3,3,3,3,3,1,2,2,1,3,1,0,3,0,0,3,0,0,0,1,1,0,1,2,1,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,2,1,1,3,3,3,2,2,1,2,2,3,1,1,2,0,0,2,2,1,3,0,0,2,1,1,2,1,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,3,3,3,1,2,2,2,1,2,1,3,3,1,1,2,1,2,1,2,2,0,2,0,0,1,1,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,3,2,1,3,2,2,3,2,0,3,2,0,3,0,1,0,1,1,0,0,1,1,1,1,0,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,2,3,3,3,2,2,2,3,3,1,2,1,2,1,0,1,0,1,1,0,1,0,0,2,1,1,1,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,1,1,2,1,2,3,3,2,2,1,2,2,3,0,2,1,0,0,2,2,3,2,1,2,2,2,2,2,3,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,1,1,0,1,1,2,2,1,1,3,0,0,1,3,1,1,1,0,0,0,1,0,1,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,3,3,3,2,0,0,0,2,1,0,1,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,0,0,2,3,2,2,2,1,2,2,2,1,2,1,0,0,1,1,1,0,2,0,1,1,1,0,0,1,1,
1,0,0,0,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,0,0,0,0,1,0,0,0,0,3,0,1,2,1,0,0,0,0,0,0,0,1,1,0,0,1,1,
1,0,1,0,1,2,0,0,1,1,2,1,0,1,1,1,1,0,1,1,1,1,0,1,0,0,1,0,0,1,1,0,
2,2,3,2,2,2,3,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,1,0,1,1,1,0,2,1,
1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,0,1,1,0,
3,3,3,2,2,2,2,3,2,2,1,1,2,2,2,2,1,1,3,1,2,1,2,0,0,1,1,0,1,0,2,1,
1,1,1,1,1,2,1,0,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,
2,0,0,1,0,3,2,2,2,2,1,2,1,2,1,2,0,0,0,2,1,2,2,1,1,2,2,0,1,1,0,2,
1,1,1,1,1,0,1,1,1,2,1,1,1,2,1,0,1,2,1,1,1,1,0,1,1,1,0,0,1,0,0,1,
1,3,2,2,2,1,1,1,2,3,0,0,0,0,2,0,2,2,1,0,0,0,0,0,0,1,0,0,0,0,1,1,
1,0,1,1,0,1,0,1,1,0,1,1,0,2,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
2,3,2,3,2,1,2,2,2,2,1,0,0,0,2,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,2,1,
1,1,2,1,0,2,0,0,1,0,1,0,0,1,0,0,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,
3,0,0,1,0,2,2,2,3,2,2,2,2,2,2,2,0,0,0,2,1,2,1,1,1,2,2,0,0,0,1,2,
1,1,1,1,1,0,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,
2,3,2,3,3,2,0,1,1,1,0,0,1,0,2,0,1,1,3,1,0,0,0,0,0,0,0,1,0,0,2,1,
1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,
2,3,3,3,3,1,2,2,2,2,0,1,1,0,2,1,1,1,2,1,0,1,1,0,0,1,0,1,0,0,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,2,0,0,1,1,2,2,1,0,0,2,0,1,1,3,0,0,1,0,0,0,0,0,1,0,1,2,1,
1,1,2,0,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,1,1,0,
1,3,2,3,2,1,0,0,2,2,2,0,1,0,2,0,1,1,1,0,1,0,0,0,3,0,1,1,0,0,2,1,
1,1,1,0,1,1,0,0,0,0,1,1,0,1,0,0,2,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,
3,1,2,1,1,2,2,2,2,2,2,1,2,2,1,1,0,0,0,2,2,2,0,0,0,1,2,1,0,1,0,1,
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,2,1,1,1,0,1,0,1,1,0,1,1,1,0,0,1,
3,0,0,0,0,2,0,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,0,1,0,1,1,0,0,1,0,1,
1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,
1,3,3,2,2,0,0,0,2,2,0,0,0,1,2,0,1,1,2,0,0,0,0,0,0,0,0,1,0,0,2,1,
0,1,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
2,3,2,3,2,0,0,0,0,1,1,0,0,0,2,0,2,0,2,0,0,0,0,0,1,0,0,1,0,0,1,1,
1,1,2,0,1,2,1,0,1,1,2,1,1,1,1,1,2,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0,
1,3,2,2,2,1,0,0,2,2,1,0,1,2,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,
0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,0,2,3,1,2,2,2,2,2,2,1,1,0,0,0,1,0,1,0,2,1,1,1,0,0,0,0,1,
1,1,0,1,1,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
2,0,2,0,0,1,0,3,2,1,2,1,2,2,0,1,0,0,0,2,1,0,0,2,1,1,1,1,0,2,0,2,
2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,0,0,0,1,1,1,1,0,1,0,0,1,
1,2,2,2,2,1,0,0,1,0,0,0,0,0,2,0,1,1,1,1,0,0,0,0,1,0,1,2,0,0,2,0,
1,0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,
2,1,2,2,2,0,3,0,1,1,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
0,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,
1,2,2,3,2,2,0,0,1,1,2,0,1,2,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,
0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
2,2,1,1,2,1,2,2,2,2,2,1,2,2,0,1,0,0,0,1,2,2,2,1,2,1,1,1,1,1,2,1,
1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1,
1,2,2,2,2,0,1,0,2,2,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,0,0,2,2,2,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,
0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,0,0,0,1,0,0,1,1,2,0,0,0,0,1,0,1,0,0,1,0,0,2,0,0,0,1,
0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,1,1,2,0,2,1,1,1,1,0,2,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,0,2,1,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
1,0,0,0,0,2,0,1,2,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1,
0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,
2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
1,1,1,0,1,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
1,1,0,1,1,0,1,0,1,0,0,0,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,0,0,
0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
};
public CyrillicModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, RUSSIAN_LANG_MODEL, 0.976601f, false, name)
{
}
}
public class Koi8rModel : CyrillicModel
{
private readonly static byte[] KOI8R_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //80
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //90
223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, //a0
238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253, //b0
27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, //c0
15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, //d0
59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, //e0
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
};
public Koi8rModel() : base(KOI8R_CHAR_TO_ORDER_MAP, "KOI8-R")
{
}
}
public class Win1251Model : CyrillicModel
{
private readonly static byte[] WIN1251_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
239,240,241,242,243,244,245,246, 68,247,248,249,250,251,252,253,
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
};
public Win1251Model() : base(WIN1251_CHAR_TO_ORDER_MAP, "windows-1251")
{
}
}
public class Latin5Model : CyrillicModel
{
private readonly static byte[] LATIN5_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
};
public Latin5Model() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
{
}
}
public class MacCyrillicModel : CyrillicModel
{
private readonly static byte[] MACCYRILLIC_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
239,240,241,242,243,244,245,246,247,248,249,250,251,252, 68, 16,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
};
public MacCyrillicModel() : base(MACCYRILLIC_CHAR_TO_ORDER_MAP,
"x-mac-cyrillic")
{
}
}
public class Ibm855Model : CyrillicModel
{
private readonly static byte[] IBM855_BYTE_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205,
206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70,
3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219,
220,221,222,223,224, 26, 55, 4, 42,225,226,227,228, 23, 60,229,
230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243,
8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248,
43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249,
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
};
public Ibm855Model() : base(IBM855_BYTE_TO_ORDER_MAP, "IBM855")
{
}
}
public class Ibm866Model : CyrillicModel
{
private readonly static byte[] IBM866_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
};
public Ibm866Model() : base(IBM866_CHAR_TO_ORDER_MAP, "IBM866")
{
}
}
}

View File

@ -0,0 +1,244 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class GreekModel : SequenceModel
{
// Model Table:
// total sequences: 100%
// first 512 sequences: 98.2851%
// first 1024 sequences:1.7001%
// rest sequences: 0.0359%
// negative sequences: 0.0148%
private readonly static byte[] GREEK_LANG_MODEL = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
3,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,0,3,3,0,3,2,3,3,0,3,2,3,3,3,0,0,3,0,3,0,3,3,2,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
0,2,3,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,2,3,3,0,3,3,3,3,2,3,3,3,0,
2,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,2,1,3,3,3,3,2,3,3,2,3,3,2,0,
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,0,
2,0,1,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,3,0,0,0,0,3,3,0,3,1,3,3,3,0,3,3,0,3,3,3,3,0,0,0,0,
2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,2,2,3,0,2,3,3,3,3,3,2,3,3,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,3,2,2,2,3,3,3,3,0,3,1,3,3,3,3,2,3,3,3,3,3,3,3,2,2,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,0,3,0,0,0,3,3,2,3,3,3,3,3,0,0,3,2,3,0,2,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,3,0,0,3,3,0,2,3,0,3,0,3,3,3,0,0,3,0,3,0,2,2,3,3,0,0,
0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,0,3,2,3,3,3,3,0,3,3,3,3,3,0,3,3,2,3,2,3,3,2,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,2,3,2,3,3,3,3,3,3,0,2,3,2,3,2,2,2,3,2,3,3,2,3,0,2,2,2,3,0,
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,0,3,3,3,2,3,3,0,0,3,0,3,0,0,0,3,2,0,3,0,3,0,0,2,0,2,0,
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,0,0,0,3,3,0,3,3,3,0,0,1,2,3,0,
3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,0,0,3,2,2,3,3,0,3,3,3,3,3,2,1,3,0,3,2,3,3,2,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,3,0,2,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,3,0,3,2,3,0,0,3,3,3,0,
3,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,0,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,2,0,3,2,3,0,0,3,2,3,0,
2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,1,2,2,3,3,3,3,3,3,0,2,3,0,3,0,0,0,3,3,0,3,0,2,0,0,2,3,1,0,
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,3,0,3,0,3,3,2,3,0,3,3,3,3,3,3,0,3,3,3,0,2,3,0,0,3,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,0,0,3,0,0,0,3,3,0,3,0,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,0,3,3,3,3,3,3,0,0,3,0,2,0,0,0,3,3,0,3,0,3,0,0,2,0,2,0,
0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,3,0,3,0,2,0,3,2,0,3,2,3,2,3,0,0,3,2,3,2,3,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,2,3,3,3,3,3,0,0,0,3,0,2,1,0,0,3,2,2,2,0,3,0,0,2,2,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,2,0,3,0,3,0,3,3,0,2,1,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,3,3,3,0,3,3,3,3,3,3,0,2,3,0,3,0,0,0,2,1,0,2,2,3,0,0,2,2,2,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,2,3,3,3,2,3,0,0,1,3,0,2,0,0,0,0,3,0,1,0,2,0,0,1,1,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,1,0,3,0,0,0,3,2,0,3,2,3,3,3,0,0,3,0,3,2,2,2,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,0,0,3,0,0,0,0,2,0,2,3,3,2,2,2,2,3,0,2,0,2,2,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,2,0,0,0,0,0,0,2,3,0,2,0,2,3,2,0,0,3,0,3,0,3,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,3,2,3,3,2,2,3,0,2,0,3,0,0,0,2,0,0,0,0,1,2,0,2,0,2,0,
0,2,0,2,0,2,2,0,0,1,0,2,2,2,0,2,2,2,0,2,2,2,0,0,2,0,0,1,0,0,0,0,
0,2,0,3,3,2,0,0,0,0,0,0,1,3,0,2,0,2,2,2,0,0,2,0,3,0,0,2,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,2,3,2,0,2,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,2,3,0,0,0,2,
0,1,2,0,0,0,0,2,2,0,0,0,2,1,0,2,2,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,
0,0,2,1,0,2,3,2,2,3,2,3,2,0,0,3,3,3,0,0,3,2,0,0,0,1,1,0,2,0,2,2,
0,2,0,2,0,2,2,0,0,2,0,2,2,2,0,2,2,2,2,0,0,2,0,0,0,2,0,1,0,0,0,0,
0,3,0,3,3,2,2,0,3,0,0,0,2,2,0,2,2,2,1,2,0,0,1,2,2,0,0,3,0,0,0,2,
0,1,2,0,0,0,1,2,0,0,0,0,0,0,0,2,2,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,3,3,2,2,0,0,0,2,0,2,3,3,0,2,0,0,0,0,0,0,2,2,2,0,2,2,0,2,0,2,
0,2,2,0,0,2,2,2,2,1,0,0,2,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,
0,2,0,3,2,3,0,0,0,3,0,0,2,2,0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,2,
0,0,2,2,0,0,2,2,2,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,3,2,0,2,2,2,2,2,0,0,0,2,0,0,0,0,2,0,1,0,0,2,0,1,0,0,0,
0,2,2,2,0,2,2,0,1,2,0,2,2,2,0,2,2,2,2,1,2,2,0,0,2,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,2,0,2,0,2,2,0,0,0,0,1,2,1,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,3,2,3,0,0,2,0,0,0,2,2,0,2,0,0,0,1,0,0,2,0,2,0,2,2,0,0,0,0,
0,0,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
0,2,2,3,2,2,0,0,0,0,0,0,1,3,0,2,0,2,2,0,0,0,1,0,2,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,0,2,0,3,2,0,2,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
0,0,2,0,0,0,0,1,1,0,0,2,1,2,0,2,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0,
0,3,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,2,
0,1,2,0,0,0,1,2,2,1,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,1,2,0,2,2,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,0,3,1,2,2,0,2,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,2,2,2,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,1,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,2,
0,2,2,0,0,2,2,2,2,2,0,1,2,0,0,0,2,2,0,1,0,2,0,0,2,2,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,2,
0,1,2,0,0,0,0,2,2,1,0,1,0,1,0,2,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2,
0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,
0,2,2,2,2,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,1,
0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,2,2,2,0,0,0,2,0,0,0,0,0,0,0,0,2,
0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,3,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,2,
0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,0,2,2,1,0,0,0,0,0,0,2,0,0,2,0,2,2,2,0,0,0,0,0,0,2,0,0,0,0,2,
0,0,2,0,0,2,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,
0,0,3,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,
0,2,2,2,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,
0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0,
0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2,0,2,0,0,0,
0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
public GreekModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, GREEK_LANG_MODEL, 0.982851f, false, name)
{
}
}
public class Latin7Model : GreekModel
{
/****************************************************************
255: Control characters that usually does not exist in any text
254: Carriage/Return
253: symbol (punctuation) that does not belong to word
252: 0 - 9
*****************************************************************/
//Character Mapping Table:
private readonly static byte[] LATIN7_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40
79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50
253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60
78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90
+253,233, 90,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0
253,253,253,253,247,248, 61, 36, 46, 71, 73,253, 54,253,108,123, //b0
110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
};
public Latin7Model() : base(LATIN7_CHAR_TO_ORDER_MAP, "ISO-8859-7")
{
}
}
public class Win1253Model : GreekModel
{
private readonly static byte[] WIN1253__CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40
79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50
253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60
78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90
+253,233, 61,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0
253,253,253,253,247,253,253, 36, 46, 71, 73,253, 54,253,108,123, //b0
110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
};
public Win1253Model() : base(WIN1253__CHAR_TO_ORDER_MAP, "windows-1253")
{
}
}
}

View File

@ -0,0 +1,220 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class HebrewModel : SequenceModel
{
//Model Table:
//total sequences: 100%
//first 512 sequences: 98.4004%
//first 1024 sequences: 1.5981%
//rest sequences: 0.087%
//negative sequences: 0.0015%
private readonly static byte[] HEBREW_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
1,2,1,2,1,2,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,
1,2,1,3,1,1,0,0,2,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,2,2,1,3,
1,2,1,1,2,2,0,0,2,2,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,2,
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,3,2,2,2,1,2,2,2,2,
1,2,1,1,2,2,0,1,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,2,2,2,
0,2,0,2,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,2,
0,2,1,2,2,2,0,0,2,1,0,0,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,2,2,2,
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,2,0,2,
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,2,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,3,2,1,2,1,1,1,
0,1,1,1,1,1,3,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,
0,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,2,1,2,3,3,2,3,3,3,3,2,3,2,1,2,0,2,1,2,
0,2,0,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,1,2,2,3,3,2,3,2,3,2,2,3,1,2,2,0,2,2,2,
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,2,3,3,3,3,1,3,2,2,2,
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,2,2,2,1,2,2,0,2,2,2,2,
0,2,0,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,2,3,3,2,3,3,2,2,1,2,2,2,2,2,2,
0,2,1,2,1,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,2,3,3,2,3,3,3,3,2,3,2,3,3,3,3,3,2,2,2,2,2,2,2,1,
0,2,0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,3,3,3,2,3,2,3,2,1,2,3,0,2,1,2,2,
0,2,1,1,2,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,1,3,1,2,2,2,1,2,3,3,1,2,1,2,2,2,2,
0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,1,3,3,3,1,2,2,2,2,1,1,2,2,2,2,2,2,
0,2,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,2,1,2,3,2,3,2,2,2,2,1,2,1,1,1,2,2,
0,2,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,
1,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,2,3,1,2,2,2,2,3,2,3,1,1,2,2,1,2,2,1,1,0,2,2,2,2,
0,1,0,1,2,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,0,0,1,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,1,0,1,0,1,1,0,1,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
3,2,2,1,2,2,2,2,2,2,2,1,2,2,1,2,2,1,1,1,1,1,1,1,1,2,1,1,0,3,3,3,
0,3,0,2,2,2,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
2,2,2,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,1,2,2,2,1,1,1,2,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,0,2,1,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,3,1,1,2,2,2,2,2,1,2,2,2,1,1,2,2,2,2,2,2,2,1,2,2,1,0,1,1,1,1,0,
0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,1,1,1,1,2,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
0,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,
2,1,1,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,1,2,1,1,1,1,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,1,2,2,2,2,2,2,2,2,2,2,1,2,1,2,1,1,2,1,1,1,2,1,2,1,2,0,1,0,1,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,1,2,2,2,1,2,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,2,1,2,1,1,0,1,0,1,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,1,1,1,1,1,1,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,2,0,1,1,1,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,1,1,0,0,
0,1,1,1,2,1,2,2,2,0,2,0,2,0,1,1,2,1,1,1,1,2,1,0,1,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,1,0,0,0,0,0,1,0,1,2,2,0,1,0,0,1,1,2,2,1,2,0,2,0,0,0,1,2,0,1,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,2,0,2,1,2,0,2,0,0,1,1,1,1,1,1,0,1,0,0,0,1,0,0,1,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,1,2,2,0,0,1,0,0,0,1,0,0,1,
1,1,2,1,0,1,1,1,0,1,0,1,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,2,1,
0,2,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,0,0,1,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,1,0,0,0,1,1,0,1,
2,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,1,1,2,1,1,2,0,1,0,0,0,1,1,0,1,
1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,0,0,2,1,1,2,0,2,0,0,0,1,1,0,1,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,2,2,1,2,1,1,0,1,0,0,0,1,1,0,1,
2,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,1,0,1,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,2,1,1,1,0,2,1,1,0,0,0,2,1,0,1,
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,0,2,1,1,0,1,0,0,0,1,1,0,1,
2,2,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,0,1,2,1,0,2,0,0,0,1,1,0,1,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,
0,1,0,0,2,0,2,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,2,1,1,1,1,1,0,1,0,0,0,0,1,0,1,
0,1,1,1,2,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
};
public HebrewModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, HEBREW_LANG_MODEL, 0.984004f, false, name)
{
}
}
public class Win1255Model : HebrewModel
{
/*
255: Control characters that usually does not exist in any text
254: Carriage/Return
253: symbol (punctuation) that does not belong to word
252: 0 - 9
*/
//Windows-1255 language model
//Character Mapping Table:
private readonly static byte[] WIN1255_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, //40
78,121, 86, 71, 67,102,107, 84,114,103,115,253,253,253,253,253, //50
253, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, //60
66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,253,253,253,253,253, //70
124,202,203,204,205, 40, 58,206,207,208,209,210,211,212,213,214,
215, 83, 52, 47, 46, 72, 32, 94,216,113,217,109,218,219,220,221,
34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227,
106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234,
30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237,
238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250,
9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23,
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
};
public Win1255Model() : base(WIN1255_CHAR_TO_ORDER_MAP, "windows-1255")
{
}
}
}

View File

@ -0,0 +1,238 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class HungarianModel : SequenceModel
{
//Model Table:
//total sequences: 100%
//first 512 sequences: 94.7368%
//first 1024 sequences:5.2623%
//rest sequences: 0.8894%
//negative sequences: 0.0009%
private readonly static byte[] HUNGARIAN_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
3,2,1,3,3,3,3,3,2,3,3,3,3,3,1,1,2,3,3,3,3,3,3,3,1,1,3,2,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,1,1,2,3,3,3,1,3,3,3,3,3,1,3,3,2,2,0,3,2,3,
0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,3,3,2,3,3,2,2,3,2,3,2,0,3,2,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,1,2,3,2,2,3,1,2,3,3,2,2,0,3,3,3,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,3,2,
0,0,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,2,1,3,2,2,3,2,1,3,2,2,1,0,3,3,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,2,2,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,3,2,2,3,1,1,3,2,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,1,3,3,3,3,3,2,2,1,3,3,3,0,1,1,2,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,2,0,3,2,3,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,1,3,2,2,2,3,1,1,3,3,1,1,0,3,3,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,3,3,3,3,3,1,2,3,2,2,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,2,2,2,3,1,3,3,2,2,1,3,3,3,1,1,3,1,2,3,2,3,2,2,2,1,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,2,2,3,2,1,0,3,2,0,1,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,1,0,3,3,3,3,0,2,3,0,0,2,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,3,2,2,2,2,3,3,0,1,2,3,2,3,2,2,3,2,1,2,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
3,3,3,3,3,3,1,2,3,3,3,2,1,2,3,3,2,2,2,3,2,3,3,1,3,3,1,1,0,2,3,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,1,2,2,2,2,3,3,3,1,1,1,3,3,1,1,3,1,1,3,2,1,2,3,1,1,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,2,1,2,1,1,3,3,1,1,1,1,3,3,1,1,2,2,1,2,1,1,2,2,1,1,0,2,2,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,1,1,2,1,1,3,3,1,0,1,1,3,3,2,0,1,1,2,3,1,0,2,2,1,0,0,1,3,2,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,2,1,3,3,3,3,3,1,2,3,2,3,3,2,1,1,3,2,3,2,1,2,2,0,1,2,1,0,0,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,2,2,2,2,3,1,2,2,1,1,3,3,0,3,2,1,2,3,2,1,3,3,1,1,0,2,1,3,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,2,2,2,3,2,3,3,3,2,1,1,3,3,1,1,1,2,2,3,2,3,2,2,2,1,0,2,2,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
1,0,0,3,3,3,3,3,0,0,3,3,2,3,0,0,0,2,3,3,1,0,1,2,0,0,1,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,2,3,3,3,3,3,1,2,3,3,2,2,1,1,0,3,3,2,2,1,2,2,1,0,2,2,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,2,2,1,3,1,2,3,3,2,2,1,1,2,2,1,1,1,1,3,2,1,1,1,1,2,1,0,1,2,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
2,3,3,1,1,1,1,1,3,3,3,0,1,1,3,3,1,1,1,1,1,2,2,0,3,1,1,2,0,2,1,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,1,0,1,2,1,2,2,0,1,2,3,1,2,0,0,0,2,1,1,1,1,1,2,0,0,1,1,0,0,0,0,
1,2,1,2,2,2,1,2,1,2,0,2,0,2,2,1,1,2,1,1,2,1,1,1,0,1,0,0,0,1,1,0,
1,1,1,2,3,2,3,3,0,1,2,2,3,1,0,1,0,2,1,2,2,0,1,1,0,0,1,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,3,3,2,2,1,0,0,3,2,3,2,0,0,0,1,1,3,0,0,1,1,0,0,2,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,2,2,3,3,1,0,1,3,2,3,1,1,1,0,1,1,1,1,1,3,1,0,0,2,2,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,1,2,2,2,1,0,1,2,3,3,2,0,0,0,2,1,1,1,2,1,1,1,0,1,1,1,0,0,0,
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,2,1,1,1,1,1,1,0,1,1,1,0,0,1,1,
3,2,2,1,0,0,1,1,2,2,0,3,0,1,2,1,1,0,0,1,1,1,0,1,1,1,1,0,2,1,1,1,
2,2,1,1,1,2,1,2,1,1,1,1,1,1,1,2,1,1,1,2,3,1,1,1,1,1,1,1,1,1,0,1,
2,3,3,0,1,0,0,0,3,3,1,0,0,1,2,2,1,0,0,0,0,2,0,0,1,1,1,0,2,1,1,1,
2,1,1,1,1,1,1,2,1,1,0,1,1,0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,1,0,1,
2,3,3,0,1,0,0,0,2,2,0,0,0,0,1,2,2,0,0,0,0,1,0,0,1,1,0,0,2,0,1,0,
2,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
3,2,2,0,1,0,1,0,2,3,2,0,0,1,2,2,1,0,0,1,1,1,0,0,2,1,0,1,2,2,1,1,
2,1,1,1,1,1,1,2,1,1,1,1,1,1,0,2,1,0,1,1,0,1,1,1,0,1,1,2,1,1,0,1,
2,2,2,0,0,1,0,0,2,2,1,1,0,0,2,1,1,0,0,0,1,2,0,0,2,1,0,0,2,1,1,1,
2,1,1,1,1,2,1,2,1,1,1,2,2,1,1,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,
1,2,3,0,0,0,1,0,3,2,1,0,0,1,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,2,1,
1,1,0,0,0,1,0,1,1,1,1,1,2,0,0,1,0,0,0,2,0,0,1,1,1,1,1,1,1,1,0,1,
3,0,0,2,1,2,2,1,0,0,2,1,2,2,0,0,0,2,1,1,1,0,1,1,0,0,1,1,2,0,0,0,
1,2,1,2,2,1,1,2,1,2,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,0,0,1,
1,3,2,0,0,0,1,0,2,2,2,0,0,0,2,2,1,0,0,0,0,3,1,1,1,1,0,0,2,1,1,1,
2,1,0,1,1,1,0,1,1,1,1,1,1,1,0,2,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1,
2,3,2,0,0,0,1,0,2,2,0,0,0,0,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,1,0,
2,1,1,1,1,2,1,2,1,2,0,1,1,1,0,2,1,1,1,2,1,1,1,1,0,1,1,1,1,1,0,1,
3,1,1,2,2,2,3,2,1,1,2,2,1,1,0,1,0,2,2,1,1,1,1,1,0,0,1,1,0,1,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,0,0,0,0,0,2,2,0,0,0,0,2,2,1,0,0,0,1,1,0,0,1,2,0,0,2,1,1,1,
2,2,1,1,1,2,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,1,1,0,1,2,1,1,1,0,1,
1,0,0,1,2,3,2,1,0,0,2,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0,
1,2,1,2,1,2,1,1,1,2,0,2,1,1,1,0,1,2,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
2,3,2,0,0,0,0,0,1,1,2,1,0,0,1,1,1,0,0,0,0,2,0,0,1,1,0,0,2,1,1,1,
2,1,1,1,1,1,1,2,1,0,1,1,1,1,0,2,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,
1,2,2,0,1,1,1,0,2,2,2,0,0,0,3,2,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,0,
1,1,0,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,0,0,1,1,1,0,1,0,1,
2,1,0,2,1,1,2,2,1,1,2,1,1,1,0,0,0,1,1,0,1,1,1,1,0,0,1,1,1,0,0,0,
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,0,
1,2,3,0,0,0,1,0,2,2,0,0,0,0,2,2,0,0,0,0,0,1,0,0,1,0,0,0,2,0,1,0,
2,1,1,1,1,1,0,2,0,0,0,1,2,1,1,1,1,0,1,2,0,1,0,1,0,1,1,1,0,1,0,1,
2,2,2,0,0,0,1,0,2,1,2,0,0,0,1,1,2,0,0,0,0,1,0,0,1,1,0,0,2,1,0,1,
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
1,2,2,0,0,0,1,0,2,2,2,0,0,0,1,1,0,0,0,0,0,1,1,0,2,0,0,1,1,1,0,1,
1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,0,1,
1,0,0,1,0,1,2,1,0,0,1,1,1,2,0,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0,
0,2,1,2,1,1,1,1,1,2,0,2,0,1,1,0,1,2,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
2,1,1,0,1,2,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,2,1,0,1,
2,2,1,1,1,1,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,0,1,0,1,1,1,1,1,0,1,
1,2,2,0,0,0,0,0,1,1,0,0,0,0,2,1,0,0,0,0,0,2,0,0,2,2,0,0,2,0,0,1,
2,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,
1,1,2,0,0,3,1,0,2,1,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,0,
1,2,1,0,1,1,1,2,1,1,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,0,
2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,2,0,0,0,
2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,1,0,1,
2,1,1,1,2,1,1,1,0,1,1,2,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,0,1,1,1,1,1,0,0,1,1,2,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0,
1,2,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
2,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,2,0,0,1,0,0,1,0,1,0,0,0,
0,1,1,1,1,1,1,1,1,2,0,1,1,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,1,1,0,0,2,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,
0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
0,1,1,1,1,1,0,0,1,1,0,1,0,1,0,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,1,0,1,0,0,1,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
};
public HungarianModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, HUNGARIAN_LANG_MODEL, 0.947368f,
false, name)
{
}
}
public class Latin2HungarianModel : HungarianModel
{
private readonly static byte[] LATIN2_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,
46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,
253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8,
23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,
159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,
175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,
191,192,193,194,195,196,197, 75,198,199,200,201,202,203,204,205,
79,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,
221, 51, 81,222, 78,223,224,225,226, 44,227,228,229, 61,230,231,
232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241,
82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85,
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
};
public Latin2HungarianModel() : base(LATIN2_CHAR_TO_ORDER_MAP, "ISO-8859-2")
{
}
}
public class Win1250HungarianModel : HungarianModel
{
private readonly static byte[] WIN1250_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,
46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,
253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8,
23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,
161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,
177,178,179,180, 78,181, 69,182,183,184,185,186,187,188,189,190,
191,192,193,194,195,196,197, 76,198,199,200,201,202,203,204,205,
81,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,
221, 51, 83,222, 80,223,224,225,226, 44,227,228,229, 61,230,231,
232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241,
84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87,
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
};
public Win1250HungarianModel() : base(WIN1250_CHAR_TO_ORDER_MAP, "windows-1250")
{
}
}
}

View File

@ -0,0 +1,213 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class ThaiModel : SequenceModel
{
/****************************************************************
255: Control characters that usually does not exist in any text
254: Carriage/Return
253: symbol (punctuation) that does not belong to word
252: 0 - 9
*****************************************************************/
// The following result for thai was collected from a limited sample (1M)
private readonly static byte[] TIS620_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,182,106,107,100,183,184,185,101, 94,186,187,108,109,110,111, //40
188,189,190, 89, 95,112,113,191,192,193,194,253,253,253,253,253, //50
253, 64, 72, 73,114, 74,115,116,102, 81,201,117, 90,103, 78, 82, //60
96,202, 91, 79, 84,104,105, 97, 98, 92,203,253,253,253,253,253, //70
209,210,211,212,213, 88,214,215,216,217,218,219,220,118,221,222,
223,224, 99, 85, 83,225,226,227,228,229,230,231,232,233,234,235,
236, 5, 30,237, 24,238, 75, 8, 26, 52, 34, 51,119, 47, 58, 57,
49, 53, 55, 43, 20, 19, 44, 14, 48, 3, 17, 25, 39, 62, 31, 54,
45, 9, 16, 2, 61, 15,239, 12, 42, 46, 18, 21, 76, 4, 66, 63,
22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,240,241,242,243,244,
11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247,
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
};
//Model Table:
//total sequences: 100%
//first 512 sequences: 92.6386%
//first 1024 sequences:7.3177%
//rest sequences: 1.0230%
//negative sequences: 0.0436%
private readonly static byte[] THAI_LANG_MODEL = {
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
0,2,3,0,0,0,0,1,0,1,2,3,1,1,3,2,2,0,1,1,0,0,1,0,0,0,0,0,0,0,1,1,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,3,3,2,3,2,3,3,2,2,2,
3,1,2,3,0,3,3,2,2,1,2,3,3,1,2,0,1,3,0,1,0,0,1,0,0,0,0,0,0,0,1,1,
3,3,2,2,3,3,3,3,1,2,3,3,3,3,3,2,2,2,2,3,3,2,2,3,3,2,2,3,2,3,2,2,
3,3,1,2,3,1,2,2,3,3,1,0,2,1,0,0,3,1,2,1,0,0,1,0,0,0,0,0,0,1,0,1,
3,3,3,3,3,3,2,2,3,3,3,3,2,3,2,2,3,3,2,2,3,2,2,2,2,1,1,3,1,2,1,1,
3,2,1,0,2,1,0,1,0,1,1,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,3,2,3,2,3,3,2,2,3,2,3,3,2,3,1,1,2,3,2,2,2,3,2,2,2,2,2,1,2,1,
2,2,1,1,3,3,2,1,0,1,2,2,0,1,3,0,0,0,1,1,0,0,0,0,0,2,3,0,0,2,1,1,
3,3,2,3,3,2,0,0,3,3,0,3,3,0,2,2,3,1,2,2,1,1,1,0,2,2,2,0,2,2,1,1,
0,2,1,0,2,0,0,2,0,1,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,2,3,3,2,0,0,3,3,0,2,3,0,2,1,2,2,2,2,1,2,0,0,2,2,2,0,2,2,1,1,
0,2,1,0,2,0,0,2,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,2,3,2,3,2,0,2,2,1,3,2,1,3,2,1,2,3,2,2,3,0,2,3,2,2,1,2,2,2,2,
1,2,2,0,0,0,0,2,0,1,2,0,1,1,1,0,1,0,3,1,1,0,0,0,0,0,0,0,0,0,1,0,
3,3,2,3,3,2,3,2,2,2,3,2,2,3,2,2,1,2,3,2,2,3,1,3,2,2,2,3,2,2,2,3,
3,2,1,3,0,1,1,1,0,2,1,1,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0,
1,0,0,3,0,3,3,3,3,3,0,0,3,0,2,2,3,3,3,3,3,0,0,0,1,1,3,0,0,0,0,2,
0,0,1,0,0,0,0,0,0,0,2,3,0,0,0,3,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
2,0,3,3,3,3,0,0,2,3,0,0,3,0,3,3,2,3,3,3,3,3,0,0,3,3,3,0,0,0,3,3,
0,0,3,0,0,0,0,2,0,0,2,1,1,3,0,0,1,0,0,2,3,0,1,0,0,0,0,0,0,0,1,0,
3,3,3,3,2,3,3,3,3,3,3,3,1,2,1,3,3,2,2,1,2,2,2,3,1,1,2,0,2,1,2,1,
2,2,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,
3,0,2,1,2,3,3,3,0,2,0,2,2,0,2,1,3,2,2,1,2,1,0,0,2,2,1,0,2,1,2,2,
0,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,1,3,3,1,1,3,0,2,3,1,1,3,2,1,1,2,0,2,2,3,2,1,1,1,1,1,2,
3,0,0,1,3,1,2,1,2,0,3,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
3,3,1,1,3,2,3,3,3,1,3,2,1,3,2,1,3,2,2,2,2,1,3,3,1,2,1,3,1,2,3,0,
2,1,1,3,2,2,2,1,2,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
3,3,2,3,2,3,3,2,3,2,3,2,3,3,2,1,0,3,2,2,2,1,2,2,2,1,2,2,1,2,1,1,
2,2,2,3,0,1,3,1,1,1,1,0,1,1,0,2,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,3,2,2,1,1,3,2,3,2,3,2,0,3,2,2,1,2,0,2,2,2,1,2,2,2,2,1,
3,2,1,2,2,1,0,2,0,1,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,2,3,1,2,3,3,2,2,3,0,1,1,2,0,3,3,2,2,3,0,1,1,3,0,0,0,0,
3,1,0,3,3,0,2,0,2,1,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,3,2,3,3,0,1,3,1,1,2,1,2,1,1,3,1,1,0,2,3,1,1,1,1,1,1,1,1,
3,1,1,2,2,2,2,1,1,1,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,2,2,1,1,2,1,3,3,2,3,2,2,3,2,2,3,1,2,2,1,2,0,3,2,1,2,2,2,2,2,1,
3,2,1,2,2,2,1,1,1,1,0,0,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,1,3,3,0,2,1,0,3,2,0,0,3,1,0,1,1,0,1,0,0,0,0,0,1,
1,0,0,1,0,3,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,2,2,2,3,0,0,1,3,0,3,2,0,3,2,2,3,3,3,3,3,1,0,2,2,2,0,2,2,1,2,
0,2,3,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,0,2,3,1,3,3,2,3,3,0,3,3,0,3,2,2,3,2,3,3,3,0,0,2,2,3,0,1,1,1,3,
0,0,3,0,0,0,2,2,0,1,3,0,1,2,2,2,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,
3,2,3,3,2,0,3,3,2,2,3,1,3,2,1,3,2,0,1,2,2,0,2,3,2,1,0,3,0,0,0,0,
3,0,0,2,3,1,3,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,3,2,2,2,1,2,0,1,3,1,1,3,1,3,0,0,2,1,1,1,1,2,1,1,1,0,2,1,0,1,
1,2,0,0,0,3,1,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,3,1,0,0,0,1,0,
3,3,3,3,2,2,2,2,2,1,3,1,1,1,2,0,1,1,2,1,2,1,3,2,0,0,3,1,1,1,1,1,
3,1,0,2,3,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,2,3,0,3,3,0,2,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,3,1,3,0,0,1,2,0,0,2,0,3,3,2,3,3,3,2,3,0,0,2,2,2,0,0,0,2,2,
0,0,1,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,1,2,3,1,3,3,0,0,1,0,3,0,0,0,0,0,
0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,1,2,3,1,2,3,1,0,3,0,2,2,1,0,2,1,1,2,0,1,0,0,1,1,1,1,0,1,0,0,
1,0,0,0,0,1,1,0,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,1,0,1,1,1,3,1,2,2,2,2,2,2,1,1,1,1,0,3,1,0,1,3,1,1,1,1,
1,1,0,2,0,1,3,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,
3,0,2,2,1,3,3,2,3,3,0,1,1,0,2,2,1,2,1,3,3,1,0,0,3,2,0,0,0,0,2,1,
0,1,0,0,0,0,1,2,0,1,1,3,1,1,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,3,0,0,1,0,0,0,3,0,0,3,0,3,1,0,1,1,1,3,2,0,0,0,3,0,0,0,0,2,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,1,3,2,1,3,3,1,2,2,0,1,2,1,0,1,2,0,0,0,0,0,3,0,0,0,3,0,0,0,0,
3,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,1,2,0,3,3,3,2,2,0,1,1,0,1,3,0,0,0,2,2,0,0,0,0,3,1,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,2,3,1,2,0,0,2,1,0,3,1,0,1,2,0,1,1,1,1,3,0,0,3,1,1,0,2,2,1,1,
0,2,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,3,1,2,0,0,2,2,0,1,2,0,1,0,1,3,1,2,1,0,0,0,2,0,3,0,0,0,1,0,
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,1,1,2,2,0,0,0,2,0,2,1,0,1,1,0,1,1,1,2,1,0,0,1,1,1,0,2,1,1,1,
0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,
0,0,0,2,0,1,3,1,1,1,1,0,0,0,0,3,2,0,1,0,0,0,1,2,0,0,0,1,0,0,0,0,
0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,2,3,2,2,0,0,0,1,0,0,0,0,2,3,2,1,2,2,3,0,0,0,2,3,1,0,0,0,1,1,
0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,
3,3,2,2,0,1,0,0,0,0,2,0,2,0,1,0,0,0,1,1,0,0,0,2,1,0,1,0,1,1,0,0,
0,1,0,2,0,0,1,0,3,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,1,0,0,1,0,0,0,0,0,1,1,2,0,0,0,0,1,0,0,1,3,1,0,0,0,0,1,1,0,0,
0,1,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,
3,3,1,1,1,1,2,3,0,0,2,1,1,1,1,1,0,2,1,1,0,0,0,2,1,0,1,2,1,1,0,1,
2,1,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,1,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,
0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,2,0,0,0,0,0,0,1,2,1,0,1,1,0,2,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,2,0,0,0,1,3,0,1,0,0,0,2,0,0,0,0,0,0,0,1,2,0,0,0,0,0,
3,3,0,0,1,1,2,0,0,1,2,1,0,1,1,1,0,1,1,0,0,2,1,1,0,1,0,0,1,1,1,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,1,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,0,0,1,1,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,0,1,2,0,1,2,0,0,1,1,0,2,0,1,0,0,1,0,0,0,0,1,0,0,0,2,0,0,0,0,
1,0,0,1,0,1,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,2,1,3,0,0,0,0,1,1,0,0,0,0,0,0,0,3,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,0,1,0,0,2,0,0,2,0,0,1,1,2,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,3,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,
1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,1,0,0,2,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
public ThaiModel(byte[] charToOrderMap, string name)
: base(TIS620_CHAR_TO_ORDER_MAP, THAI_LANG_MODEL,
0.926386f, false, "TIS-620")
{
}
}
}

View File

@ -0,0 +1,180 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core
{
// TODO: Using trigrams the detector should be able to discriminate between
// latin-1 and iso8859-2
public class Latin1Prober : CharsetProber
{
private const int FREQ_CAT_NUM = 4;
private const int UDF = 0; // undefined
private const int OTH = 1; // other
private const int ASC = 2; // ascii capital letter
private const int ASS = 3; // ascii small letter
private const int ACV = 4; // accent capital vowel
private const int ACO = 5; // accent capital other
private const int ASV = 6; // accent small vowel
private const int ASO = 7; // accent small other
private const int CLASS_NUM = 8; // total classes
private readonly static byte[] Latin1_CharToClass = {
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
};
/* 0 : illegal
1 : very unlikely
2 : normal
3 : very likely
*/
private readonly static byte[] Latin1ClassModel = {
/* UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
/*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
/*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
/*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
/*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
};
private byte lastCharClass;
private int[] freqCounter = new int[FREQ_CAT_NUM];
public Latin1Prober()
{
Reset();
}
public override string GetCharsetName()
{
return "windows-1252";
}
public override void Reset()
{
state = ProbingState.Detecting;
lastCharClass = OTH;
for (int i = 0; i < FREQ_CAT_NUM; i++)
freqCounter[i] = 0;
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
byte[] newbuf = FilterWithEnglishLetters(buf, offset, len);
byte charClass, freq;
for (int i = 0; i < newbuf.Length; i++) {
charClass = Latin1_CharToClass[newbuf[i]];
freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass];
if (freq == 0) {
state = ProbingState.NotMe;
break;
}
freqCounter[freq]++;
lastCharClass = charClass;
}
return state;
}
public override float GetConfidence()
{
if (state == ProbingState.NotMe)
return 0.01f;
float confidence = 0.0f;
int total = 0;
for (int i = 0; i < FREQ_CAT_NUM; i++) {
total += freqCounter[i];
}
if (total <= 0) {
confidence = 0.0f;
} else {
confidence = freqCounter[3] * 1.0f / total;
confidence -= freqCounter[1] * 20.0f / total;
}
// lower the confidence of latin1 so that other more accurate detector
// can take priority.
return confidence < 0.0f ? 0.0f : confidence * 0.5f;
}
public override void DumpStatus()
{
//Console.WriteLine(" Latin1Prober: {0} [{1}]", GetConfidence(), GetCharsetName());
}
}
}

View File

@ -0,0 +1,175 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core
{
/// <summary>
/// Multi-byte charsets probers
/// </summary>
public class MBCSGroupProber : CharsetProber
{
private const int PROBERS_NUM = 7;
private readonly static string[] ProberName =
{ "UTF8", "SJIS", "EUCJP", "GB18030", "EUCKR", "Big5", "EUCTW" };
private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];
private bool[] isActive = new bool[PROBERS_NUM];
private int bestGuess;
private int activeNum;
public MBCSGroupProber()
{
probers[0] = new UTF8Prober();
probers[1] = new SJISProber();
probers[2] = new EUCJPProber();
probers[3] = new GB18030Prober();
probers[4] = new EUCKRProber();
probers[5] = new Big5Prober();
probers[6] = new EUCTWProber();
Reset();
}
public override string GetCharsetName()
{
if (bestGuess == -1) {
GetConfidence();
if (bestGuess == -1)
bestGuess = 0;
}
return probers[bestGuess].GetCharsetName();
}
public override void Reset()
{
activeNum = 0;
for (int i = 0; i < probers.Length; i++) {
if (probers[i] != null) {
probers[i].Reset();
isActive[i] = true;
++activeNum;
} else {
isActive[i] = false;
}
}
bestGuess = -1;
state = ProbingState.Detecting;
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
// do filtering to reduce load to probers
byte[] highbyteBuf = new byte[len];
int hptr = 0;
//assume previous is not ascii, it will do no harm except add some noise
bool keepNext = true;
int max = offset + len;
for (int i = offset; i < max; i++) {
if ((buf[i] & 0x80) != 0) {
highbyteBuf[hptr++] = buf[i];
keepNext = true;
} else {
//if previous is highbyte, keep this even it is a ASCII
if (keepNext) {
highbyteBuf[hptr++] = buf[i];
keepNext = false;
}
}
}
ProbingState st = ProbingState.NotMe;
for (int i = 0; i < probers.Length; i++) {
if (!isActive[i])
continue;
st = probers[i].HandleData(highbyteBuf, 0, hptr);
if (st == ProbingState.FoundIt) {
bestGuess = i;
state = ProbingState.FoundIt;
break;
} else if (st == ProbingState.NotMe) {
isActive[i] = false;
activeNum--;
if (activeNum <= 0) {
state = ProbingState.NotMe;
break;
}
}
}
return state;
}
public override float GetConfidence()
{
float bestConf = 0.0f;
float cf = 0.0f;
if (state == ProbingState.FoundIt) {
return 0.99f;
} else if (state == ProbingState.NotMe) {
return 0.01f;
} else {
for (int i = 0; i < PROBERS_NUM; i++) {
if (!isActive[i])
continue;
cf = probers[i].GetConfidence();
if (bestConf < cf) {
bestConf = cf;
bestGuess = i;
}
}
}
return bestConf;
}
public override void DumpStatus()
{
float cf;
GetConfidence();
for (int i = 0; i < PROBERS_NUM; i++) {
if (!isActive[i]) {
//Console.WriteLine(" MBCS inactive: {0} (confidence is too low).", ProberName[i]);
} else {
cf = probers[i].GetConfidence();
//Console.WriteLine(" MBCS {0}: [{1}]", cf, ProberName[i]);
}
}
}
}
}

View File

@ -0,0 +1,640 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class UTF8SMModel : SMModel
{
private readonly static int[] UTF8_cls = {
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 40 - 47
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 48 - 4f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 50 - 57
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 58 - 5f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 60 - 67
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 68 - 6f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 70 - 77
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 78 - 7f
BitPackage.Pack4bits(2,2,2,2,3,3,3,3), // 80 - 87
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 88 - 8f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 90 - 97
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 98 - 9f
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // a0 - a7
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // a8 - af
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // b0 - b7
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // b8 - bf
BitPackage.Pack4bits(0,0,6,6,6,6,6,6), // c0 - c7
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c8 - cf
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d0 - d7
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d8 - df
BitPackage.Pack4bits(7,8,8,8,8,8,8,8), // e0 - e7
BitPackage.Pack4bits(8,8,8,8,8,9,8,8), // e8 - ef
BitPackage.Pack4bits(10,11,11,11,11,11,11,11), // f0 - f7
BitPackage.Pack4bits(12,13,13,13,14,15,0,0) // f8 - ff
};
private readonly static int[] UTF8_st = {
BitPackage.Pack4bits(ERROR,START,ERROR,ERROR,ERROR,ERROR, 12, 10),//00-07
BitPackage.Pack4bits( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//10-17
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//18-1f
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//20-27
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//28-2f
BitPackage.Pack4bits(ERROR,ERROR, 5, 5, 5, 5,ERROR,ERROR),//30-37
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//38-3f
BitPackage.Pack4bits(ERROR,ERROR,ERROR, 5, 5, 5,ERROR,ERROR),//40-47
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//48-4f
BitPackage.Pack4bits(ERROR,ERROR, 7, 7, 7, 7,ERROR,ERROR),//50-57
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//58-5f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 7, 7,ERROR,ERROR),//60-67
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//68-6f
BitPackage.Pack4bits(ERROR,ERROR, 9, 9, 9, 9,ERROR,ERROR),//70-77
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//78-7f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR, 9,ERROR,ERROR),//80-87
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//88-8f
BitPackage.Pack4bits(ERROR,ERROR, 12, 12, 12, 12,ERROR,ERROR),//90-97
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//98-9f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR, 12,ERROR,ERROR),//a0-a7
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//a8-af
BitPackage.Pack4bits(ERROR,ERROR, 12, 12, 12,ERROR,ERROR,ERROR),//b0-b7
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//b8-bf
BitPackage.Pack4bits(ERROR,ERROR,START,START,START,START,ERROR,ERROR),//c0-c7
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR) //c8-cf
};
private readonly static int[] UTF8CharLenTable =
{0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6 };
public UTF8SMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, UTF8_cls),
16,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, UTF8_st),
UTF8CharLenTable, "UTF-8")
{
}
}
public class GB18030SMModel : SMModel
{
private readonly static int[] GB18030_cls = {
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 30 - 37
BitPackage.Pack4bits(3,3,1,1,1,1,1,1), // 38 - 3f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
BitPackage.Pack4bits(2,2,2,2,2,2,2,4), // 78 - 7f
BitPackage.Pack4bits(5,6,6,6,6,6,6,6), // 80 - 87
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 88 - 8f
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 90 - 97
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 98 - 9f
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a0 - a7
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a8 - af
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b0 - b7
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b8 - bf
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c0 - c7
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c8 - cf
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d0 - d7
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d8 - df
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e0 - e7
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e8 - ef
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // f0 - f7
BitPackage.Pack4bits(6,6,6,6,6,6,6,0) // f8 - ff
};
private readonly static int[] GB18030_st = {
BitPackage.Pack4bits(ERROR,START,START,START,START,START, 3,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START),//10-17
BitPackage.Pack4bits( 4,ERROR,START,START,ERROR,ERROR,ERROR,ERROR),//18-1f
BitPackage.Pack4bits(ERROR,ERROR, 5,ERROR,ERROR,ERROR,ITSME,ERROR),//20-27
BitPackage.Pack4bits(ERROR,ERROR,START,START,START,START,START,START) //28-2f
};
// To be accurate, the length of class 6 can be either 2 or 4.
// But it is not necessary to discriminate between the two since
// it is used for frequency analysis only, and we are validating
// each code range there as well. So it is safe to set it to be
// 2 here.
private readonly static int[] GB18030CharLenTable = {0, 1, 1, 1, 1, 1, 2};
public GB18030SMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, GB18030_cls),
7,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, GB18030_st),
GB18030CharLenTable, "GB18030")
{
}
}
public class BIG5SMModel : SMModel
{
private readonly static int[] BIG5_cls = {
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 80 - 87
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 88 - 8f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 90 - 97
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 98 - 9f
BitPackage.Pack4bits(4,3,3,3,3,3,3,3), // a0 - a7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // a8 - af
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b0 - b7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b8 - bf
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c0 - c7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7
BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff
};
private readonly static int[] BIG5_st = {
BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ERROR),//08-0f
BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17
};
private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0};
public BIG5SMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, BIG5_cls),
5,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, BIG5_st),
BIG5CharLenTable, "Big5")
{
}
}
public class EUCJPSMModel : SMModel
{
private readonly static int[] EUCJP_cls = {
//BitPacket.Pack4bits(5,4,4,4,4,4,4,4), // 00 - 07
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 00 - 07
BitPackage.Pack4bits(4,4,4,4,4,4,5,5), // 08 - 0f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 10 - 17
BitPackage.Pack4bits(4,4,4,5,4,4,4,4), // 18 - 1f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 20 - 27
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 28 - 2f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 30 - 37
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 38 - 3f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 40 - 47
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 48 - 4f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 50 - 57
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 58 - 5f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 60 - 67
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 68 - 6f
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 70 - 77
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 78 - 7f
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 80 - 87
BitPackage.Pack4bits(5,5,5,5,5,5,1,3), // 88 - 8f
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 90 - 97
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 98 - 9f
BitPackage.Pack4bits(5,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
BitPackage.Pack4bits(0,0,0,0,0,0,0,5) // f8 - ff
};
private readonly static int[] EUCJP_st = {
BitPackage.Pack4bits( 3, 4, 3, 5,START,ERROR,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME,START,ERROR,START,ERROR,ERROR,ERROR),//10-17
BitPackage.Pack4bits(ERROR,ERROR,START,ERROR,ERROR,ERROR, 3,ERROR),//18-1f
BitPackage.Pack4bits( 3,ERROR,ERROR,ERROR,START,START,START,START) //20-27
};
private readonly static int[] EUCJPCharLenTable = { 2, 2, 2, 3, 1, 0 };
public EUCJPSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, EUCJP_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, EUCJP_st),
EUCJPCharLenTable, "EUC-JP")
{
}
}
public class EUCKRSMModel : SMModel
{
private readonly static int[] EUCKR_cls = {
//BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 40 - 47
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 48 - 4f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 50 - 57
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 58 - 5f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 60 - 67
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 68 - 6f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 70 - 77
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 78 - 7f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
BitPackage.Pack4bits(0,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,3,3,3), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,3,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
BitPackage.Pack4bits(2,2,2,2,2,2,2,0) // f8 - ff
};
private readonly static int[] EUCKR_st = {
BitPackage.Pack4bits(ERROR,START, 3,ERROR,ERROR,ERROR,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START,START) //08-0f
};
private readonly static int[] EUCKRCharLenTable = { 0, 1, 2, 0 };
public EUCKRSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, EUCKR_cls),
4,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, EUCKR_st),
EUCKRCharLenTable, "EUC-KR")
{
}
}
public class EUCTWSMModel : SMModel
{
private readonly static int[] EUCTW_cls = {
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 00 - 07
BitPackage.Pack4bits(2,2,2,2,2,2,0,0), // 08 - 0f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 10 - 17
BitPackage.Pack4bits(2,2,2,0,2,2,2,2), // 18 - 1f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 20 - 27
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 28 - 2f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 30 - 37
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 38 - 3f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 78 - 7f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
BitPackage.Pack4bits(0,0,0,0,0,0,6,0), // 88 - 8f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
BitPackage.Pack4bits(0,3,4,4,4,4,4,4), // a0 - a7
BitPackage.Pack4bits(5,5,1,1,1,1,1,1), // a8 - af
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf
BitPackage.Pack4bits(1,1,3,1,3,3,3,3), // c0 - c7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7
BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff
};
private readonly static int[] EUCTW_st = {
BitPackage.Pack4bits(ERROR,ERROR,START, 3, 3, 3, 4,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,START,ERROR),//10-17
BitPackage.Pack4bits(START,START,START,ERROR,ERROR,ERROR,ERROR,ERROR),//18-1f
BitPackage.Pack4bits( 5,ERROR,ERROR,ERROR,START,ERROR,START,START),//20-27
BitPackage.Pack4bits(START,ERROR,START,START,START,START,START,START) //28-2f
};
private readonly static int[] EUCTWCharLenTable = { 0, 0, 1, 2, 2, 2, 3 };
public EUCTWSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, EUCTW_cls),
7,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, EUCTW_st),
EUCTWCharLenTable, "EUC-TW")
{
}
}
public class SJISSMModel : SMModel
{
private readonly static int[] SJIS_cls = {
//BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 80 - 87
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 88 - 8f
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 90 - 97
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 98 - 9f
//0xa0 is illegal in sjis encoding, but some pages does
//contain such byte. We need to be more error forgiven.
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
BitPackage.Pack4bits(3,3,3,3,3,4,4,4), // e8 - ef
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // f0 - f7
BitPackage.Pack4bits(4,4,4,4,4,0,0,0) // f8 - ff
};
private readonly static int[] SJIS_st = {
BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,START,START,START,START) //10-17
};
private readonly static int[] SJISCharLenTable = { 0, 1, 1, 2, 0, 0 };
public SJISSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, SJIS_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, SJIS_st),
SJISCharLenTable, "Shift_JIS")
{
}
}
public class UCS2BESMModel : SMModel
{
private readonly static int[] UCS2BE_cls = {
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff
};
private readonly static int[] UCS2BE_st = {
BitPackage.Pack4bits( 5, 7, 7,ERROR, 4, 3,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME, 6, 6, 6, 6,ERROR,ERROR),//10-17
BitPackage.Pack4bits( 6, 6, 6, 6, 6,ITSME, 6, 6),//18-1f
BitPackage.Pack4bits( 6, 6, 6, 6, 5, 7, 7,ERROR),//20-27
BitPackage.Pack4bits( 5, 8, 6, 6,ERROR, 6, 6, 6),//28-2f
BitPackage.Pack4bits( 6, 6, 6, 6,ERROR,ERROR,START,START) //30-37
};
private readonly static int[] UCS2BECharLenTable = { 2, 2, 2, 0, 2, 2 };
public UCS2BESMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, UCS2BE_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, UCS2BE_st),
UCS2BECharLenTable, "UTF-16BE")
{
}
}
public class UCS2LESMModel : SMModel
{
private readonly static int[] UCS2LE_cls = {
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff
};
private readonly static int[] UCS2LE_st = {
BitPackage.Pack4bits( 6, 6, 7, 6, 4, 3,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME, 5, 5, 5,ERROR,ITSME,ERROR),//10-17
BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR, 6, 6),//18-1f
BitPackage.Pack4bits( 7, 6, 8, 8, 5, 5, 5,ERROR),//20-27
BitPackage.Pack4bits( 5, 5, 5,ERROR,ERROR,ERROR, 5, 5),//28-2f
BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR,START,START) //30-37
};
private readonly static int[] UCS2LECharLenTable = { 2, 2, 2, 2, 2, 2 };
public UCS2LESMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, UCS2LE_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, UCS2LE_st),
UCS2LECharLenTable, "UTF-16LE")
{
}
}
}

View File

@ -0,0 +1,180 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core
{
public class SBCSGroupProber : CharsetProber
{
private const int PROBERS_NUM = 13;
private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];
private bool[] isActive = new bool[PROBERS_NUM];
private int bestGuess;
private int activeNum;
public SBCSGroupProber()
{
probers[0] = new SingleByteCharSetProber(new Win1251Model());
probers[1] = new SingleByteCharSetProber(new Koi8rModel());
probers[2] = new SingleByteCharSetProber(new Latin5Model());
probers[3] = new SingleByteCharSetProber(new MacCyrillicModel());
probers[4] = new SingleByteCharSetProber(new Ibm866Model());
probers[5] = new SingleByteCharSetProber(new Ibm855Model());
probers[6] = new SingleByteCharSetProber(new Latin7Model());
probers[7] = new SingleByteCharSetProber(new Win1253Model());
probers[8] = new SingleByteCharSetProber(new Latin5BulgarianModel());
probers[9] = new SingleByteCharSetProber(new Win1251BulgarianModel());
HebrewProber hebprober = new HebrewProber();
probers[10] = hebprober;
// Logical
probers[11] = new SingleByteCharSetProber(new Win1255Model(), false, hebprober);
// Visual
probers[12] = new SingleByteCharSetProber(new Win1255Model(), true, hebprober);
hebprober.SetModelProbers(probers[11], probers[12]);
// disable latin2 before latin1 is available, otherwise all latin1
// will be detected as latin2 because of their similarity.
//probers[13] = new SingleByteCharSetProber(new Latin2HungarianModel());
//probers[14] = new SingleByteCharSetProber(new Win1250HungarianModel());
Reset();
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
ProbingState st = ProbingState.NotMe;
//apply filter to original buffer, and we got new buffer back
//depend on what script it is, we will feed them the new buffer
//we got after applying proper filter
//this is done without any consideration to KeepEnglishLetters
//of each prober since as of now, there are no probers here which
//recognize languages with English characters.
byte[] newBuf = FilterWithoutEnglishLetters(buf, offset, len);
if (newBuf.Length == 0)
return state; // Nothing to see here, move on.
for (int i = 0; i < PROBERS_NUM; i++) {
if (!isActive[i])
continue;
st = probers[i].HandleData(newBuf, 0, newBuf.Length);
if (st == ProbingState.FoundIt) {
bestGuess = i;
state = ProbingState.FoundIt;
break;
} else if (st == ProbingState.NotMe) {
isActive[i] = false;
activeNum--;
if (activeNum <= 0) {
state = ProbingState.NotMe;
break;
}
}
}
return state;
}
public override float GetConfidence()
{
float bestConf = 0.0f, cf;
switch (state) {
case ProbingState.FoundIt:
return 0.99f; //sure yes
case ProbingState.NotMe:
return 0.01f; //sure no
default:
for (int i = 0; i < PROBERS_NUM; i++)
{
if (!isActive[i])
continue;
cf = probers[i].GetConfidence();
if (bestConf < cf)
{
bestConf = cf;
bestGuess = i;
}
}
break;
}
return bestConf;
}
public override void DumpStatus()
{
float cf = GetConfidence();
// Console.WriteLine(" SBCS Group Prober --------begin status");
for (int i = 0; i < PROBERS_NUM; i++) {
if (isActive[i])
probers[i].DumpStatus();
//else
//Console.WriteLine(" inactive: [{0}] (i.e. confidence is too low).", probers[i].GetCharsetName());
}
//Console.WriteLine(" SBCS Group found best match [{0}] confidence {1}.", probers[bestGuess].GetCharsetName(), cf);
}
public override void Reset ()
{
int activeNum = 0;
for (int i = 0; i < PROBERS_NUM; i++) {
if (probers[i] != null) {
probers[i].Reset();
isActive[i] = true;
activeNum++;
} else {
isActive[i] = false;
}
}
bestGuess = -1;
state = ProbingState.Detecting;
}
public override string GetCharsetName()
{
//if we have no answer yet
if (bestGuess == -1) {
GetConfidence();
//no charset seems positive
if (bestGuess == -1)
bestGuess = 0;
}
return probers[bestGuess].GetCharsetName();
}
}
}

View File

@ -0,0 +1,170 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core
{
public class SingleByteCharSetProber : CharsetProber
{
private const int SAMPLE_SIZE = 64;
private const int SB_ENOUGH_REL_THRESHOLD = 1024;
private const float POSITIVE_SHORTCUT_THRESHOLD = 0.95f;
private const float NEGATIVE_SHORTCUT_THRESHOLD = 0.05f;
private const int SYMBOL_CAT_ORDER = 250;
private const int NUMBER_OF_SEQ_CAT = 4;
private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT-1;
private const int NEGATIVE_CAT = 0;
protected SequenceModel model;
// true if we need to reverse every pair in the model lookup
bool reversed;
// char order of last character
byte lastOrder;
int totalSeqs;
int totalChar;
int[] seqCounters = new int[NUMBER_OF_SEQ_CAT];
// characters that fall in our sampling range
int freqChar;
// Optional auxiliary prober for name decision. created and destroyed by the GroupProber
CharsetProber nameProber;
public SingleByteCharSetProber(SequenceModel model)
: this(model, false, null)
{
}
public SingleByteCharSetProber(SequenceModel model, bool reversed,
CharsetProber nameProber)
{
this.model = model;
this.reversed = reversed;
this.nameProber = nameProber;
Reset();
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int max = offset + len;
for (int i = offset; i < max; i++) {
byte order = model.GetOrder(buf[i]);
if (order < SYMBOL_CAT_ORDER)
totalChar++;
if (order < SAMPLE_SIZE) {
freqChar++;
if (lastOrder < SAMPLE_SIZE) {
totalSeqs++;
if (!reversed)
++(seqCounters[model.GetPrecedence(lastOrder*SAMPLE_SIZE+order)]);
else // reverse the order of the letters in the lookup
++(seqCounters[model.GetPrecedence(order*SAMPLE_SIZE+lastOrder)]);
}
}
lastOrder = order;
}
if (state == ProbingState.Detecting) {
if (totalSeqs > SB_ENOUGH_REL_THRESHOLD) {
float cf = GetConfidence();
if (cf > POSITIVE_SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
state = ProbingState.NotMe;
}
}
return state;
}
public override void DumpStatus()
{
//Console.WriteLine(" SBCS: {0} [{1}]", GetConfidence(), GetCharsetName());
}
public override float GetConfidence()
{
/*
NEGATIVE_APPROACH
if (totalSeqs > 0) {
if (totalSeqs > seqCounters[NEGATIVE_CAT] * 10)
return (totalSeqs - seqCounters[NEGATIVE_CAT] * 10)/totalSeqs * freqChar / mTotalChar;
}
return 0.01f;
*/
// POSITIVE_APPROACH
float r = 0.0f;
if (totalSeqs > 0) {
r = 1.0f * seqCounters[POSITIVE_CAT] / totalSeqs / model.TypicalPositiveRatio;
r = r * freqChar / totalChar;
if (r >= 1.0f)
r = 0.99f;
return r;
}
return 0.01f;
}
public override void Reset()
{
state = ProbingState.Detecting;
lastOrder = 255;
for (int i = 0; i < NUMBER_OF_SEQ_CAT; i++)
seqCounters[i] = 0;
totalSeqs = 0;
totalChar = 0;
freqChar = 0;
}
public override string GetCharsetName()
{
return (nameProber == null) ? model.CharsetName
: nameProber.GetCharsetName();
}
}
}

View File

@ -0,0 +1,116 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
/// <summary>
/// for S-JIS encoding, observe characteristic:
/// 1, kana character (or hankaku?) often have hight frequency of appereance
/// 2, kana character often exist in group
/// 3, certain combination of kana is never used in japanese language
/// </summary>
public class SJISProber : CharsetProber
{
private CodingStateMachine codingSM;
private SJISContextAnalyser contextAnalyser;
private SJISDistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public SJISProber()
{
codingSM = new CodingStateMachine(new SJISSMModel());
distributionAnalyser = new SJISDistributionAnalyser();
contextAnalyser = new SJISContextAnalyser();
Reset();
}
public override string GetCharsetName()
{
return "Shift-JIS";
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState;
int max = offset + len;
for (int i = offset; i < max; i++) {
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) {
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START) {
int charLen = codingSM.CurrentCharLen;
if (i == offset) {
lastChar[1] = buf[offset];
contextAnalyser.HandleOneChar(lastChar, 2-charLen, charLen);
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else {
contextAnalyser.HandleOneChar(buf, i+1-charLen, charLen);
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
}
}
}
lastChar[0] = buf[max-1];
if (state == ProbingState.Detecting)
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
contextAnalyser.Reset();
distributionAnalyser.Reset();
}
public override float GetConfidence()
{
float contxtCf = contextAnalyser.GetConfidence();
float distribCf = distributionAnalyser.GetConfidence();
return (contxtCf > distribCf ? contxtCf : distribCf);
}
}
}

View File

@ -0,0 +1,83 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA <k-tak@void.in> (Java port)
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core
{
/// <summary>
/// State machine model
/// </summary>
public abstract class SMModel
{
public const int START = 0;
public const int ERROR = 1;
public const int ITSME = 2;
public BitPackage classTable;
public BitPackage stateTable;
public int[] charLenTable;
private string name;
public string Name {
get { return name; }
}
private int classFactor;
public int ClassFactor {
get { return classFactor; }
}
public SMModel(BitPackage classTable, int classFactor,
BitPackage stateTable, int[] charLenTable, String name)
{
this.classTable = classTable;
this.classFactor = classFactor;
this.stateTable = stateTable;
this.charLenTable = charLenTable;
this.name = name;
}
public int GetClass(byte b)
{
return classTable.Unpack((int)b);
}
}
}

View File

@ -0,0 +1,97 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core
{
public abstract class SequenceModel
{
// [256] table use to find a char's order
protected byte[] charToOrderMap;
// [SAMPLE_SIZE][SAMPLE_SIZE] table to find a 2-char sequence's
// frequency
protected byte[] precedenceMatrix;
// freqSeqs / totalSeqs
protected float typicalPositiveRatio;
public float TypicalPositiveRatio {
get { return typicalPositiveRatio; }
}
// not used
protected bool keepEnglishLetter;
public bool KeepEnglishLetter {
get { return keepEnglishLetter; }
}
protected String charsetName;
public string CharsetName {
get { return charsetName; }
}
public SequenceModel(
byte[] charToOrderMap,
byte[] precedenceMatrix,
float typicalPositiveRatio,
bool keepEnglishLetter,
String charsetName)
{
this.charToOrderMap = charToOrderMap;
this.precedenceMatrix = precedenceMatrix;
this.typicalPositiveRatio = typicalPositiveRatio;
this.keepEnglishLetter = keepEnglishLetter;
this.charsetName = charsetName;
}
public byte GetOrder(byte b)
{
return charToOrderMap[b];
}
public byte GetPrecedence(int pos)
{
return precedenceMatrix[pos];
}
}
}

View File

@ -0,0 +1,112 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class UTF8Prober : CharsetProber
{
private static float ONE_CHAR_PROB = 0.50f;
private CodingStateMachine codingSM;
private int numOfMBChar;
public UTF8Prober()
{
numOfMBChar = 0;
codingSM = new CodingStateMachine(new UTF8SMModel());
Reset();
}
public override string GetCharsetName() {
return "UTF-8";
}
public override void Reset()
{
codingSM.Reset();
numOfMBChar = 0;
state = ProbingState.Detecting;
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState = SMModel.START;
int max = offset + len;
for (int i = offset; i < max; i++) {
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) {
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME) {
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START) {
if (codingSM.CurrentCharLen >= 2)
numOfMBChar++;
}
}
if (state == ProbingState.Detecting)
if (GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override float GetConfidence()
{
float unlike = 0.99f;
float confidence = 0.0f;
if (numOfMBChar < 6) {
for (int i = 0; i < numOfMBChar; i++)
unlike *= ONE_CHAR_PROB;
confidence = 1.0f - unlike;
} else {
confidence = 0.99f;
}
return confidence;
}
}
}

View File

@ -0,0 +1,257 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
enum InputState { PureASCII=0, EscASCII=1, Highbyte=2 };
public abstract class UniversalDetector
{
protected const int FILTER_CHINESE_SIMPLIFIED = 1;
protected const int FILTER_CHINESE_TRADITIONAL = 2;
protected const int FILTER_JAPANESE = 4;
protected const int FILTER_KOREAN = 8;
protected const int FILTER_NON_CJK = 16;
protected const int FILTER_ALL = 31;
protected static int FILTER_CHINESE =
FILTER_CHINESE_SIMPLIFIED | FILTER_CHINESE_TRADITIONAL;
protected static int FILTER_CJK =
FILTER_JAPANESE | FILTER_KOREAN | FILTER_CHINESE_SIMPLIFIED
| FILTER_CHINESE_TRADITIONAL;
protected const float SHORTCUT_THRESHOLD = 0.95f;
protected const float MINIMUM_THRESHOLD = 0.20f;
internal InputState inputState;
protected bool start;
protected bool gotData;
protected bool done;
protected byte lastChar;
protected int bestGuess;
protected const int PROBERS_NUM = 3;
protected int languageFilter;
protected CharsetProber[] charsetProbers = new CharsetProber[PROBERS_NUM];
protected CharsetProber escCharsetProber;
protected string detectedCharset;
public UniversalDetector(int languageFilter) {
this.start = true;
this.inputState = InputState.PureASCII;
this.lastChar = 0x00;
this.bestGuess = -1;
this.languageFilter = languageFilter;
}
public virtual void Feed(byte[] buf, int offset, int len)
{
if (done) {
return;
}
if (len > 0)
gotData = true;
// If the data starts with BOM, we know it is UTF
if (start) {
start = false;
if (len > 3) {
switch (buf[0]) {
case 0xEF:
if (0xBB == buf[1] && 0xBF == buf[2])
detectedCharset = "UTF-8";
break;
case 0xFE:
if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
detectedCharset = "X-ISO-10646-UCS-4-3412";
else if (0xFF == buf[1])
detectedCharset = "UTF-16BE";
break;
case 0x00:
if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3])
detectedCharset = "UTF-32BE";
else if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3])
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
detectedCharset = "X-ISO-10646-UCS-4-2143";
break;
case 0xFF:
if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
detectedCharset = "UTF-32LE";
else if (0xFE == buf[1])
detectedCharset = "UTF-16LE";
break;
} // switch
}
if (detectedCharset != null) {
done = true;
return;
}
}
for (int i = 0; i < len; i++) {
// other than 0xa0, if every other character is ascii, the page is ascii
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) {
// we got a non-ascii byte (high-byte)
if (inputState != InputState.Highbyte) {
inputState = InputState.Highbyte;
// kill EscCharsetProber if it is active
if (escCharsetProber != null) {
escCharsetProber = null;
}
// start multibyte and singlebyte charset prober
if (charsetProbers[0] == null)
charsetProbers[0] = new MBCSGroupProber();
if (charsetProbers[1] == null)
charsetProbers[1] = new SBCSGroupProber();
if (charsetProbers[2] == null)
charsetProbers[2] = new Latin1Prober();
}
} else {
if (inputState == InputState.PureASCII &&
(buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E))) {
// found escape character or HZ "~{"
inputState = InputState.EscASCII;
}
lastChar = buf[i];
}
}
ProbingState st = ProbingState.NotMe;
switch (inputState) {
case InputState.EscASCII:
if (escCharsetProber == null) {
escCharsetProber = new EscCharsetProber();
}
st = escCharsetProber.HandleData(buf, offset, len);
if (st == ProbingState.FoundIt) {
done = true;
detectedCharset = escCharsetProber.GetCharsetName();
}
break;
case InputState.Highbyte:
for (int i = 0; i < PROBERS_NUM; i++) {
if (charsetProbers[i] != null) {
st = charsetProbers[i].HandleData(buf, offset, len);
#if DEBUG
charsetProbers[i].DumpStatus();
#endif
if (st == ProbingState.FoundIt) {
done = true;
detectedCharset = charsetProbers[i].GetCharsetName();
return;
}
}
}
break;
default:
// pure ascii
break;
}
return;
}
/// <summary>
/// Notify detector that no further data is available.
/// </summary>
public virtual void DataEnd()
{
if (!gotData) {
// we haven't got any data yet, return immediately
// caller program sometimes call DataEnd before anything has
// been sent to detector
return;
}
if (detectedCharset != null) {
done = true;
Report(detectedCharset, 1.0f);
return;
}
if (inputState == InputState.Highbyte) {
float proberConfidence = 0.0f;
float maxProberConfidence = 0.0f;
int maxProber = 0;
for (int i = 0; i < PROBERS_NUM; i++) {
if (charsetProbers[i] != null) {
proberConfidence = charsetProbers[i].GetConfidence();
if (proberConfidence > maxProberConfidence) {
maxProberConfidence = proberConfidence;
maxProber = i;
}
}
}
if (maxProberConfidence > MINIMUM_THRESHOLD) {
Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
}
} else if (inputState == InputState.PureASCII) {
Report("ASCII", 1.0f);
}
}
/// <summary>
/// Clear internal state of charset detector.
/// In the original interface this method is protected.
/// </summary>
public virtual void Reset()
{
done = false;
start = true;
detectedCharset = null;
gotData = false;
bestGuess = -1;
inputState = InputState.PureASCII;
lastChar = 0x00;
if (escCharsetProber != null)
escCharsetProber.Reset();
for (int i = 0; i < PROBERS_NUM; i++)
if (charsetProbers[i] != null)
charsetProbers[i].Reset();
}
protected abstract void Report(string charset, float confidence);
}
}

View File

@ -0,0 +1,75 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector
{
/// <summary>
/// Indicate how confident the detection module about the return result.
///
/// NoAnswerYet: the detector have not find out a answer yet based on
/// the data it received.
///
/// BestAnswer: the answer the detector returned is the best one within
/// the knowledge of the detector. In other words, the test to all
/// other candidates fail.
/// For example, the (Shift_JIS/EUC-JP/ISO-2022-JP) detection
/// module may return this with answer "Shift_JIS " if it receive
/// bytes > 0x80 (which make ISO-2022-JP test failed) and byte
/// 0x82 (which may EUC-JP test failed)
///
/// SureAnswer: the detector is 100% sure about the answer.
///
/// Example 1: the Shift_JIS/ISO-2022-JP/EUC-JP detector return
/// this w/ ISO-2022-JP when it hit one of the following ESC seq
/// ESC ( J
/// ESC $ @
/// ESC $ B
///
/// Example 2: the detector which can detect UCS2 return w/ UCS2
/// when the first 2 byte are BOM mark.
/// Example 3: the Korean detector return ISO-2022-KR when it
/// hit ESC $ ) C
/// </summary>
public enum DetectionConfidence
{
NoAnswerYet = 0,
BestAnswer,
SureAnswer,
NoAnswerMatch
}
}

View File

@ -0,0 +1,88 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System.IO;
namespace UniversalDetector
{
public interface ICharsetDetector
{
/// <summary>
/// The detected charset. It can be null.
/// </summary>
string Charset { get; }
/// <summary>
/// The confidence of the detected charset, if any
/// </summary>
float Confidence { get; }
/// <summary>
/// Feed a block of bytes to the detector.
/// </summary>
/// <param name="buf">input buffer</param>
/// <param name="offset">offset into buffer</param>
/// <param name="len">number of available bytes</param>
void Feed(byte[] buf, int offset, int len);
/// <summary>
/// Feed a bytes stream to the detector.
/// </summary>
/// <param name="stream">an input stream</param>
void Feed(Stream stream);
/// <summary>
/// Resets the state of the detector.
/// </summary>
void Reset();
/// <summary>
/// Returns true if the detector has found a result and it is sure about it.
/// </summary>
/// <returns>true if the detector has detected the encoding</returns>
bool IsDone();
/// <summary>
/// Tell the detector that there is no more data and it must take its
/// decision.
/// </summary>
void DataEnd();
}
}

View File

@ -564,7 +564,7 @@ namespace Emby.Server.Core
StringExtensions.LocalizationManager = LocalizationManager;
RegisterSingleInstance(LocalizationManager);
ITextEncoding textEncoding = new TextEncoding(FileSystemManager);
ITextEncoding textEncoding = new TextEncoding(FileSystemManager, LogManager.GetLogger("TextEncoding"));
RegisterSingleInstance(textEncoding);
Utilities.EncodingHelper = textEncoding;
RegisterSingleInstance<IBlurayExaminer>(() => new BdInfoExaminer(FileSystemManager, textEncoding));

View File

@ -1990,24 +1990,6 @@ namespace Emby.Server.Implementations.Data
}
index++;
if (string.IsNullOrWhiteSpace(item.Tagline))
{
var movie = item as Movie;
if (movie != null && movie.Taglines.Count > 0)
{
movie.Tagline = movie.Taglines[0];
}
}
if (type == typeof(Person) && item.ProductionLocations.Count == 0)
{
var person = (Person)item;
if (!string.IsNullOrWhiteSpace(person.PlaceOfBirth))
{
item.ProductionLocations = new List<string> { person.PlaceOfBirth };
}
}
return item;
}

View File

@ -311,10 +311,6 @@
<HintPath>..\packages\SQLitePCLRaw.core.1.1.2\lib\portable-net45+netcore45+wpa81+MonoAndroid10+MonoTouch10+Xamarin.iOS10\SQLitePCLRaw.core.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="UniversalDetector, Version=1.0.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\UniversalDetector.1.0.1\lib\portable-net45+sl4+wp71+win8+wpa81\UniversalDetector.dll</HintPath>
<Private>True</Private>
</Reference>
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Localization\Core\ar.json" />

View File

@ -13,7 +13,6 @@ using MediaBrowser.Common.IO;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Services;
using MediaBrowser.Model.Text;
using UniversalDetector;
namespace Emby.Server.Implementations.ServerManager
{
@ -137,7 +136,8 @@ namespace Emby.Server.Implementations.ServerManager
{
return;
}
var charset = DetectCharset(bytes);
var charset = _textEncoding.GetDetectedEncodingName(bytes, null);
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
{
@ -148,33 +148,6 @@ namespace Emby.Server.Implementations.ServerManager
OnReceiveInternal(_textEncoding.GetASCIIEncoding().GetString(bytes, 0, bytes.Length));
}
}
private string DetectCharset(byte[] bytes)
{
try
{
using (var ms = _memoryStreamProvider.CreateNew(bytes))
{
var detector = new CharsetDetector();
detector.Feed(ms);
detector.DataEnd();
var charset = detector.Charset;
if (!string.IsNullOrWhiteSpace(charset))
{
//_logger.Debug("UniversalDetector detected charset {0}", charset);
}
return charset;
}
}
catch (IOException ex)
{
_logger.ErrorException("Error attempting to determine web socket message charset", ex);
}
return null;
}
private void OnReceiveInternal(string message)
{

View File

@ -4,5 +4,4 @@
<package id="MediaBrowser.Naming" version="1.0.5" targetFramework="portable45-net45+win8" />
<package id="SQLitePCL.pretty" version="1.1.0" targetFramework="portable45-net45+win8" />
<package id="SQLitePCLRaw.core" version="1.1.2" targetFramework="portable45-net45+win8" />
<package id="UniversalDetector" version="1.0.1" targetFramework="portable45-net45+win8" />
</packages>

View File

@ -27,24 +27,15 @@ namespace MediaBrowser.Controller.Entities.Movies
RemoteTrailers = new List<MediaUrl>();
LocalTrailerIds = new List<Guid>();
RemoteTrailerIds = new List<Guid>();
Taglines = new List<string>();
}
public string AwardSummary { get; set; }
public float? Metascore { get; set; }
public List<Guid> LocalTrailerIds { get; set; }
public List<Guid> RemoteTrailerIds { get; set; }
public List<MediaUrl> RemoteTrailers { get; set; }
/// <summary>
/// Gets or sets the taglines.
/// </summary>
/// <value>The taglines.</value>
public List<string> Taglines { get; set; }
/// <summary>
/// Gets or sets the name of the TMDB collection.
/// </summary>

View File

@ -15,12 +15,6 @@ namespace MediaBrowser.Controller.Entities
/// </summary>
public class Person : BaseItem, IItemByName, IHasLookupInfo<PersonLookupInfo>
{
/// <summary>
/// Gets or sets the place of birth.
/// </summary>
/// <value>The place of birth.</value>
public string PlaceOfBirth { get; set; }
public override List<string> GetUserDataKeys()
{
var list = base.GetUserDataKeys();

View File

@ -36,12 +36,6 @@
<OutputPath>bin\Release Mono</OutputPath>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="UniversalDetector, Version=1.0.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\UniversalDetector.1.0.1\lib\portable-net45+sl4+wp71+win8+wpa81\UniversalDetector.dll</HintPath>
<Private>True</Private>
</Reference>
</ItemGroup>
<ItemGroup>
<Compile Include="..\SharedVersion.cs">
<Link>Properties\SharedVersion.cs</Link>

View File

@ -21,7 +21,6 @@ using MediaBrowser.Model.IO;
using MediaBrowser.Model.Diagnostics;
using MediaBrowser.Model.Dto;
using MediaBrowser.Model.Text;
using UniversalDetector;
namespace MediaBrowser.MediaEncoding.Subtitles
{
@ -197,17 +196,20 @@ namespace MediaBrowser.MediaEncoding.Subtitles
{
if (requiresCharset)
{
var charset = await GetSubtitleFileCharacterSet(path, language, protocol, cancellationToken).ConfigureAwait(false);
var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);
var charset = _textEncoding.GetDetectedEncodingName(bytes, language);
_logger.Debug("charset {0} detected for {1}", charset ?? "null", path);
if (!string.IsNullOrEmpty(charset))
{
using (var fs = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
using (var inputStream = _memoryStreamProvider.CreateNew(bytes))
{
using (var reader = new StreamReader(fs, GetEncoding(charset)))
using (var reader = new StreamReader(inputStream, _textEncoding.GetEncodingFromCharset(charset)))
{
var text = await reader.ReadToEndAsync().ConfigureAwait(false);
var bytes = Encoding.UTF8.GetBytes(text);
bytes = Encoding.UTF8.GetBytes(text);
return _memoryStreamProvider.CreateNew(bytes);
}
@ -218,28 +220,6 @@ namespace MediaBrowser.MediaEncoding.Subtitles
return _fileSystem.OpenRead(path);
}
private Encoding GetEncoding(string charset)
{
if (string.IsNullOrWhiteSpace(charset))
{
throw new ArgumentNullException("charset");
}
_logger.Debug("Getting encoding object for character set: {0}", charset);
try
{
return Encoding.GetEncoding(charset);
}
catch (ArgumentException)
{
charset = charset.Replace("-", string.Empty);
_logger.Debug("Getting encoding object for character set: {0}", charset);
return Encoding.GetEncoding(charset);
}
}
private async Task<Tuple<string, MediaProtocol, string, bool>> GetReadableFile(string mediaPath,
string[] inputFiles,
MediaProtocol protocol,
@ -724,126 +704,33 @@ namespace MediaBrowser.MediaEncoding.Subtitles
public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
if (protocol == MediaProtocol.File)
{
var fileEncoding = _textEncoding.GetFileEncoding(path);
var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);
if (fileEncoding != null && fileEncoding.Equals(Encoding.UTF8))
{
return string.Empty;
}
}
var charset = _textEncoding.GetDetectedEncodingName(bytes, language);
var charset = await DetectCharset(path, language, protocol, cancellationToken).ConfigureAwait(false);
_logger.Debug("charset {0} detected for {1}", charset ?? "null", path);
if (!string.IsNullOrWhiteSpace(charset))
{
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
{
return null;
}
return charset;
}
if (!string.IsNullOrWhiteSpace(language))
{
return GetSubtitleFileCharacterSetFromLanguage(language);
}
return null;
return charset;
}
public string GetSubtitleFileCharacterSetFromLanguage(string language)
{
// https://developer.xamarin.com/api/type/System.Text.Encoding/
switch (language.ToLower())
{
case "hun":
return "windows-1252";
case "pol":
case "cze":
case "ces":
case "slo":
case "slk":
case "slv":
case "srp":
case "hrv":
case "rum":
case "ron":
case "rup":
case "alb":
case "sqi":
return "windows-1250";
case "ara":
return "windows-1256";
case "heb":
return "windows-1255";
case "grc":
case "gre":
return "windows-1253";
case "crh":
case "ota":
case "tur":
return "windows-1254";
case "rus":
return "windows-1251";
case "vie":
return "windows-1258";
case "kor":
return "cp949";
default:
return "windows-1252";
}
}
private async Task<string> DetectCharset(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
try
{
using (var file = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
{
var detector = new CharsetDetector();
detector.Feed(file);
detector.DataEnd();
var charset = detector.Charset;
if (!string.IsNullOrWhiteSpace(charset))
{
_logger.Info("UniversalDetector detected charset {0} for {1}", charset, path);
}
// This is often incorrectly indetected. If this happens, try to use other techniques instead
if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase))
{
if (!string.IsNullOrWhiteSpace(language))
{
return null;
}
}
return charset;
}
}
catch (IOException ex)
{
_logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
}
return null;
}
private async Task<Stream> GetStream(string path, MediaProtocol protocol, CancellationToken cancellationToken)
private async Task<byte[]> GetBytes(string path, MediaProtocol protocol, CancellationToken cancellationToken)
{
if (protocol == MediaProtocol.Http)
{
return await _httpClient.Get(path, cancellationToken).ConfigureAwait(false);
using (var file = await _httpClient.Get(path, cancellationToken).ConfigureAwait(false))
{
using (var memoryStream = new MemoryStream())
{
await file.CopyToAsync(memoryStream).ConfigureAwait(false);
memoryStream.Position = 0;
return memoryStream.ToArray();
}
}
}
if (protocol == MediaProtocol.File)
{
return _fileSystem.GetFileStream(path, FileOpenMode.Open, FileAccessMode.Read, FileShareMode.ReadWrite);
return _fileSystem.ReadAllBytes(path);
}
throw new ArgumentOutOfRangeException("protocol");

View File

@ -1,4 +1,3 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="UniversalDetector" version="1.0.1" targetFramework="net46" />
</packages>

View File

@ -1,10 +1,14 @@
using System.Text;
using System.IO;
using System.Text;
namespace MediaBrowser.Model.Text
{
public interface ITextEncoding
{
Encoding GetASCIIEncoding();
Encoding GetFileEncoding(string path);
string GetDetectedEncodingName(byte[] bytes, string language);
Encoding GetDetectedEncoding(byte[] bytes, string language);
Encoding GetEncodingFromCharset(string charset);
}
}