/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */

#include "palCUTF8Prober.h"
#include "palCDetectorUtility.h"

/*
 * Byte-class table for the UTF-8 coding state machine.
 *
 * Holds one 4-bit class per byte value (0x00 - 0xff), eight classes per
 * word via MW_PCK4BITS, hence 256 / 8 words. The trailing comment on each
 * row is the byte range that row covers (assuming MW_PCK4BITS packs its
 * arguments in ascending byte order - confirm against the macro).
 *
 * The palCUTF8Prober constructor installs this array as the data of
 * m_pSMModel->classTable.
 *
 * As written in the rows below:
 *   - class 0      : 0x0e, 0x0f, 0x1b, 0xc0, 0xc1, 0xfe, 0xff
 *   - class 1      : every other byte below 0x80 (0x00 included, see the
 *                    note on the first row)
 *   - classes 2-5  : 0x80 - 0xbf
 *   - classes 6-15 : 0xc2 - 0xfd
 */
const BRUINT32 MW_UTF8_cls [ 256 / 8 ] = {
	// Previous first row, kept for reference: it put 0x00 in class 0.
	//MW_PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07 
	MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07  //allow 0x00 as a legal value
		MW_PCK4BITS(1,1,1,1,1,1,0,0),  // 08 - 0f 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 10 - 17 
		MW_PCK4BITS(1,1,1,0,1,1,1,1),  // 18 - 1f 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 20 - 27 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 28 - 2f 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 30 - 37 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 38 - 3f 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 40 - 47 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 48 - 4f 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 50 - 57 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 58 - 5f 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 60 - 67 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 68 - 6f 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 70 - 77 
		MW_PCK4BITS(1,1,1,1,1,1,1,1),  // 78 - 7f 
		MW_PCK4BITS(2,2,2,2,3,3,3,3),  // 80 - 87 
		MW_PCK4BITS(4,4,4,4,4,4,4,4),  // 88 - 8f 
		MW_PCK4BITS(4,4,4,4,4,4,4,4),  // 90 - 97 
		MW_PCK4BITS(4,4,4,4,4,4,4,4),  // 98 - 9f 
		MW_PCK4BITS(5,5,5,5,5,5,5,5),  // a0 - a7 
		MW_PCK4BITS(5,5,5,5,5,5,5,5),  // a8 - af 
		MW_PCK4BITS(5,5,5,5,5,5,5,5),  // b0 - b7 
		MW_PCK4BITS(5,5,5,5,5,5,5,5),  // b8 - bf 
		MW_PCK4BITS(0,0,6,6,6,6,6,6),  // c0 - c7 
		MW_PCK4BITS(6,6,6,6,6,6,6,6),  // c8 - cf 
		MW_PCK4BITS(6,6,6,6,6,6,6,6),  // d0 - d7 
		MW_PCK4BITS(6,6,6,6,6,6,6,6),  // d8 - df 
		MW_PCK4BITS(7,8,8,8,8,8,8,8),  // e0 - e7 
		MW_PCK4BITS(8,8,8,8,8,9,8,8),  // e8 - ef 
		MW_PCK4BITS(10,11,11,11,11,11,11,11),  // f0 - f7 
		MW_PCK4BITS(12,13,13,13,14,15,0,0)   // f8 - ff 
};

/*
 * State-transition table for the UTF-8 coding state machine.
 *
 * 26 words of eight packed 4-bit entries (MW_PCK4BITS). The trailing
 * comment on each row is the range of flat entry indices, in hex, that the
 * row covers. The palCUTF8Prober constructor installs this array as the
 * data of m_pSMModel->stateTable and sets classFactor to 16, so the table
 * is presumably indexed as (current state * 16 + byte class) - confirm
 * against palCCodingStateMachine::NextState.
 *
 * Under that layout each run of 16 entries is the row for one state:
 * entries 0x00-0x0f are the start state, 0x10-0x1f all stay in eSS_Error
 * and 0x20-0x2f all stay in eSS_ItsMe.
 *
 * Entry 0x01 (start state, class 1 - the class MW_UTF8_cls assigns to
 * single-byte characters) must lead back to eSS_Start. It used to be
 * eSS_Error, which rejected a buffer as soon as any single-byte character
 * was seen between multi-byte sequences.
 */
const BRUINT32 UTF8_st [ 26] = {
	MW_PCK4BITS(eSS_Error,eSS_Start,eSS_Error,eSS_Error,eSS_Error,eSS_Error,       12,       10),//00-07
	MW_PCK4BITS(        9,       11,        8,        7,        6,        5,        4,        3),//08-0f
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//10-17
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//18-1f
	MW_PCK4BITS(eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe),//20-27
	MW_PCK4BITS(eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe,eSS_ItsMe),//28-2f
	MW_PCK4BITS(eSS_Error,eSS_Error,        5,        5,        5,        5,eSS_Error,eSS_Error),//30-37
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//38-3f
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,        5,        5,        5,eSS_Error,eSS_Error),//40-47
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//48-4f
	MW_PCK4BITS(eSS_Error,eSS_Error,        7,        7,        7,        7,eSS_Error,eSS_Error),//50-57
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//58-5f
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,        7,        7,eSS_Error,eSS_Error),//60-67
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//68-6f
	MW_PCK4BITS(eSS_Error,eSS_Error,        9,        9,        9,        9,eSS_Error,eSS_Error),//70-77
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//78-7f
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,        9,eSS_Error,eSS_Error),//80-87
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//88-8f
	MW_PCK4BITS(eSS_Error,eSS_Error,       12,       12,       12,       12,eSS_Error,eSS_Error),//90-97
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//98-9f
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,       12,eSS_Error,eSS_Error),//a0-a7
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//a8-af
	MW_PCK4BITS(eSS_Error,eSS_Error,       12,       12,       12,eSS_Error,eSS_Error,eSS_Error),//b0-b7
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error),//b8-bf
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Start,eSS_Start,eSS_Start,eSS_Start,eSS_Error,eSS_Error),//c0-c7
	MW_PCK4BITS(eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error,eSS_Error) //c8-cf
};

// Character-length table installed as m_pSMModel->charLenTable by the
// palCUTF8Prober constructor. It has 16 entries, one per 4-bit byte class,
// and is presumably indexed by the class of a character's first byte to
// give that character's length in bytes (0 for classes that cannot start
// a character) - confirm against palCCodingStateMachine.
const BRUINT32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6 };


palCUTF8Prober::palCUTF8Prober()
{
	m_pSMModel = KN_MALLOC_EX1(MW_SMModel);

	nsPkgInt* pPkgInt = KN_MALLOC_EX1(nsPkgInt);
	pPkgInt->idxsft = eIdxSft4bits;
	pPkgInt->sftmsk = eSftMsk4bits;
	pPkgInt->bitsft = eBitSft4bits;
	pPkgInt->unitmsk = eUnitMsk4bits;
	pPkgInt->data = MW_UTF8_cls;
	m_pSMModel->classTable = pPkgInt;

	m_pSMModel->classFactor = 16;

	pPkgInt = KN_MALLOC_EX1(nsPkgInt);
	pPkgInt->idxsft = eIdxSft4bits;
	pPkgInt->sftmsk = eSftMsk4bits;
	pPkgInt->bitsft = eBitSft4bits;
	pPkgInt->unitmsk = eUnitMsk4bits;
	pPkgInt->data = UTF8_st;
	m_pSMModel->stateTable = pPkgInt;

	m_pSMModel->charLenTable = UTF8CharLenTable;
	///////////////////////////////////////////////

	m_nNumOfMBChar = 0; 
	//m_pCodingSM = KN_NEW palCCodingStateMachine(&UTF8SMModel);
	m_pCodingSM = KN_NEW palCCodingStateMachine(m_pSMModel);
	Reset(); 
}

// Releases everything the constructor created.
palCUTF8Prober::~palCUTF8Prober()
{
	// The state machine was constructed with a pointer to m_pSMModel, so it
	// is destroyed first; the model must not be freed while an object that
	// refers to it is still alive.
	KN_SAFE_DEL(m_pCodingSM);

	if (m_pSMModel != BRNULL)
	{
		// Free the two packed-table descriptors before the model that
		// holds the pointers to them.
		KN_MFREE(m_pSMModel->classTable);
		KN_MFREE(m_pSMModel->stateTable);
		KN_MFREE(m_pSMModel);
	}
}

// Returns the prober to its initial condition: detecting, with no
// multi-byte characters counted and the coding state machine reset.
BRVOID palCUTF8Prober::Reset()
{
	m_eState = ePS_DETECTING;
	m_nNumOfMBChar = 0;

	m_pCodingSM->Reset();
}

// Feeds a buffer through the UTF-8 coding state machine and updates the
// probing state.
//
// a_pBuf / a_nLen : input bytes and their count; they are first passed
//                   through palCDetectorUtility::FilterWithoutEnglishLetters
//                   and only the filtered copy is examined.
// a_bTestOnly     : when set, a single complete multi-byte character is
//                   enough to report ePS_FOUNDIT.
//
// Returns the prober state after processing (also stored in m_eState).
EProbingState palCUTF8Prober::HandleData(BRCSTR a_pBuf, BRUINT32 a_nLen, IN BRBOOL a_bTestOnly /*= BRFALSE*/)
{
	ESM_State codingState;

	BRCHAR * pNewBuf = BRNULL;
	BRINT nNewBufLen = 0;
	palCDetectorUtility::FilterWithoutEnglishLetters(a_pBuf, a_nLen, &pNewBuf, &nNewBufLen);

	// No filtered buffer came back; this is treated as a memory allocation
	// failure and the current state is reported unchanged.
	if (pNewBuf == BRNULL)
		return m_eState;

	// The index is signed like nNewBufLen. With an unsigned index, a negative
	// length would be converted to a huge unsigned bound and the loop would
	// read far past the end of pNewBuf.
	for (BRINT i = 0; i < nNewBufLen; i++)
	{
		codingState = m_pCodingSM->NextState(pNewBuf[i]);
		if (codingState == eSS_Error)
		{
			m_eState = ePS_NOTME;
			break;
		}
		if (codingState == eSS_ItsMe)
		{
			m_eState = ePS_FOUNDIT;
			break;
		}
		if (codingState == eSS_Start)
		{
			// Back in the start state: one character is complete. Only
			// characters of two or more bytes count as evidence.
			if (m_pCodingSM->GetCurrentCharLen() >= 2)
				m_nNumOfMBChar++;
		}
	}

	// Still undecided: settle early if this is a test-only pass that has
	// seen at least one multi-byte character, or if the confidence already
	// exceeds the shortcut threshold.
	if (m_eState == ePS_DETECTING)
	{
		if ((a_bTestOnly && m_nNumOfMBChar) || GetConfidence() > SHORTCUT_THRESHOLD)
			m_eState = ePS_FOUNDIT;
	}

	KN_MFREE(pNewBuf);

	return m_eState;
}

// Runs HandleData on the buffer in test-only mode and reports whether the
// prober ended up in the ePS_FOUNDIT state.
BRBOOL palCUTF8Prober::TestData(IN BRCSTR a_pBuf, IN BRUINT32 a_nLen)
{
	const EProbingState eOutcome = HandleData(a_pBuf, a_nLen, BRTRUE);

	return (eOutcome == ePS_FOUNDIT) ? BRTRUE : BRFALSE;
}

// Factor applied to the remaining doubt for each multi-byte character seen.
#define ONE_CHAR_PROB   (BRFLOAT)0.30

// Returns the confidence that the data is UTF-8, derived from the number of
// complete multi-byte characters counted so far. With six or more of them
// the result is fixed at 0.99; with fewer, the doubt starts at 0.99 and is
// multiplied by ONE_CHAR_PROB once per character, and the confidence is
// 1.0 minus that doubt.
BRFLOAT palCUTF8Prober::GetConfidence()
{
	if (!(m_nNumOfMBChar < 6))
		return (BRFLOAT)0.99;

	BRFLOAT fDoubt = (BRFLOAT)0.99;

	for (BRUINT32 nLeft = m_nNumOfMBChar; nLeft > 0; --nLeft)
		fDoubt *= ONE_CHAR_PROB;

	return (BRFLOAT)1.0 - fDoubt;
}