
Published: 2012-09-15 19:09:28    Author: rapoo

[Naive Bayes] Naive Bayes in Practice_Code Implementation_Training Algorithm

Let me walk through the implementation of the Train function.

As I mentioned in the previous post, Naive Bayes comes in two flavors: the Bernoulli model and the multinomial model. This being my first Naive Bayes implementation, I followed the basic theory and built a plain Bernoulli model; the multinomial model is not hard either, just a small variation (I sketch that tweak at the end of this post). Without further ado, here is the code. It is a bit long, but easy to follow:


#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <cstring>   // memset
#include <cstdlib>   // atoi
#include <cmath>     // pow
#include <cfloat>    // DBL_MIN
using namespace std;

bool NaiveBayes::Train (const char * sFileSample, int iClassNum, int iFeaTypeNum,
                        string & sSegmenter, int iFeaExtractNum,
                        const char * sFileModel, bool bCompactModel)
{
    // defensive checks
    if (iClassNum <= 0 || iFeaTypeNum <= 0 || iFeaExtractNum <= 0)
        return false;

    ifstream in (sFileSample, ios_base::binary);
    ofstream out (sFileModel);
    if (!in || !out)
    {
        cerr << "Can not open the file" << endl;
        return false;
    }

    // 1. temp data structures that hold the model parameters and the counts
    //    needed by feature selection
    // 1.1 the total number of documents in the training samples
    int iTotalDocNum = 0;
    // 1.2 the prior probability of each class; temporarily it stores the doc count of that class
    double * pClassPriorProb = new double [iClassNum];
    memset (pClassPriorProb, 0, iClassNum*sizeof(double));
    // 1.3 the prior probability of each feature type; temporarily it stores the doc count
    //     containing that feature (used only for feature selection; the Bayes model itself
    //     does not need this parameter)
    double * pFeaItemPriorProb = new double [iFeaTypeNum];
    memset (pFeaItemPriorProb, 0, iFeaTypeNum*sizeof(double));
    // 1.4 the chi-square value of each (class, feature) pair; temporarily it stores the doc
    //     count for that class and feature (feature selection is chi-square based)
    double ** ppChiMatrix = new double * [iClassNum];
    for (int i=0; i<iClassNum; i++)
    {
        ppChiMatrix[i] = new double [iFeaTypeNum];
        memset (ppChiMatrix[i], 0, iFeaTypeNum*sizeof(double));
    }
    // 1.5 the post-probability for each (class, feature) pair
    double ** ppPProbMatrix = new double * [iClassNum];
    for (int i=0; i<iClassNum; i++)
    {
        ppPProbMatrix[i] = new double [iFeaTypeNum];
        memset (ppPProbMatrix[i], 0, iFeaTypeNum*sizeof(double)); // bug fix: zeroed ppChiMatrix[i] before
    }
    // 1.6 flags marking which features get selected
    int * pFeaSelected = new int [iFeaTypeNum];
    memset (pFeaSelected, 0, iFeaTypeNum*sizeof(int));

    // 2. iterate the training samples and fill counts into the temp data structures
    string sLine;
    int i = 0;
    while (getline (in, sLine))
    {
        // show some progress information on screen
        if (0 == i%10000)
            cout << i << "\n";
        i++;
        // 2.1 the total number of docs
        iTotalDocNum++;
        // 2.2 split the sample into the class label and the feature items
        string::size_type iSeg = sLine.find_first_of (sSegmenter);
        if (string::npos == iSeg) // skip malformed lines
            continue;
        string sTmp = sLine.substr (0, iSeg);
        int iClassId = atoi (sTmp.c_str());
        if (iClassId < 0 || iClassId >= iClassNum)
            continue;
        pClassPriorProb [iClassId]++;
        // 2.3 count the rest feature items; skip the single segmenter character
        //     (bug fix: the original "iSeg += sTmp.length()" doubled the offset and
        //     cut into the features once the class id had more than one digit)
        sTmp = sLine.substr (iSeg + 1);
        istringstream isLine (sTmp);
        string sTmpItem;
        while (isLine >> sTmpItem)
        {
            int iFeaItemId = atoi (sTmpItem.c_str());
            if (iFeaItemId < 0 || iFeaItemId >= iFeaTypeNum)
                continue;
            // add the counts
            pFeaItemPriorProb [iFeaItemId]++;
            ppChiMatrix [iClassId][iFeaItemId]++;
        }
    }

    // 3. calculate the model parameters
    // 3.1 the chi-square value as well as the post-probability
    for (int i=0; i<iClassNum; i++)
    {
        for (int j=0; j<iFeaTypeNum; j++)
        {
            double dA = ppChiMatrix[i][j];
            double dB = pFeaItemPriorProb[j] - dA; // pFeaItemPriorProb[j] still holds the doc count of feature j
            double dC = pClassPriorProb [i] - dA;  // pClassPriorProb[i] still holds the doc count of class i
            double dD = (double)iTotalDocNum - dA - dB - dC;
            // the chi value
            double dNumerator = dA * dD;
            dNumerator -= dB * dC;
            dNumerator = pow (dNumerator, 2.0);
            double dDenominator = dA + dB;
            dDenominator *= (dC + dD);
            dDenominator += DBL_MIN; // guard against division by zero
            ppChiMatrix[i][j] = dNumerator / dDenominator;
            // the post-probability: p(feature|class)
            // (assumes every class id occurs at least once in the samples)
            ppPProbMatrix[i][j] = dA / pClassPriorProb [i];
        }
    }
    // 3.2 the prior probability of each class
    for (int i=0; i<iClassNum; i++)
        pClassPriorProb [i] /= iTotalDocNum;
    // 3.3 the prior probability of each feature
    for (int i=0; i<iFeaTypeNum; i++)
        pFeaItemPriorProb [i] /= iTotalDocNum;

    // 4. feature selection (this function is covered in detail in the next post)
    FeaSelByChiSquare (ppChiMatrix, ppPProbMatrix, iClassNum, iFeaTypeNum,
                       iFeaExtractNum, pFeaSelected);

    // 5. dump the model into a text file
    if (bCompactModel)
    {
        // output only the parameters needed for prediction
        // 5.1 the prior probability of each class
        out << iClassNum << endl;
        for (int i=0; i<iClassNum; i++)
        {
            out << pClassPriorProb [i] << "\n";
        }
        // 5.2 the actual number of selected feature types
        int iActualFeaNum = 0;
        for (int j=0; j<iFeaTypeNum; j++)
        {
            if (1 == pFeaSelected[j])
                iActualFeaNum ++;
        }
        out << iActualFeaNum << endl;
        // 5.3 the post-probability
        for (int i=0; i<iClassNum; i++)
        {
            for (int j=0; j<iFeaTypeNum; j++)
            {
                if (1 == pFeaSelected[j])
                {
                    out << j << ":" << ppPProbMatrix[i][j] << "\n";
                }
            }
        }
    }
    else
    {
        // output the full information
        // 5.1 the total number of documents
        out << iTotalDocNum << endl;
        // 5.2 the prior probability of each class, as classIndex:priorProb
        out << iClassNum << endl;
        for (int i=0; i<iClassNum; i++)
        {
            out << i << ":" << pClassPriorProb [i] << "\n";
        }
        // 5.3 the prior probability of each feature type, as featureId:priorProb:selectedOrNot
        //     (NOT used by the Bayes model; recorded only for extra information)
        out << iFeaTypeNum << "\n";
        for (int i=0; i<iFeaTypeNum; i++)
        {
            out << i << ":" << pFeaItemPriorProb[i] << ":" << pFeaSelected[i] << "\n"; // bug fix: printed the pointer pFeaSelected before
        }
        // 5.4 the chi-square value for each class-feature pair
        for (int i=0; i<iClassNum; i++)
        {
            for (int j=0; j<iFeaTypeNum; j++)
            {
                out << ppChiMatrix[i][j] << "\n";
            }
        }
        // 5.5 the post-probability
        for (int i=0; i<iClassNum; i++)
        {
            for (int j=0; j<iFeaTypeNum; j++)
            {
                out << ppPProbMatrix[i][j] << "\n";
            }
        }
    }

    // last, release the memory
    delete [] pClassPriorProb;
    delete [] pFeaItemPriorProb;
    for (int i=0; i<iClassNum; i++)
    {
        delete [] ppChiMatrix[i];
    }
    delete [] ppChiMatrix;
    for (int i=0; i<iClassNum; i++)
    {
        delete [] ppPProbMatrix[i];
    }
    delete [] ppPProbMatrix;
    delete [] pFeaSelected;
    return true;
}
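Before going over what the function does, here is a hypothetical driver, just to make the input concrete. The file names, the segmenter, and all the sizes below are made up for illustration; the line format (class id, one segmenter character, then whitespace-separated feature ids) is inferred from the parsing in step 2.

// hypothetical usage: every name and number here is illustrative only
// each line of samples.txt:  <classId><segmenter><feaId> <feaId> ...
// e.g. with a tab segmenter: 3<TAB>105 4399 27018
int main ()
{
    NaiveBayes nb;
    string sSegmenter ("\t");
    bool bOk = nb.Train ("samples.txt",  // training samples, one document per line
                         20,             // iClassNum: class ids must fall in [0, 20)
                         100000,         // iFeaTypeNum: feature ids must fall in [0, 100000)
                         sSegmenter,     // separator between the class id and the features
                         5000,           // iFeaExtractNum: how many features to keep
                         "model.txt",    // where the model gets dumped
                         true);          // compact model: only what prediction needs
    return bOk ? 0 : 1;
}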

The main job of this function is counting: it accumulates the statistics for the posterior probabilities on one hand and the statistics needed by chi-square feature selection on the other, stored in ppChiMatrix and ppPProbMatrix respectively. Both are two-dimensional arrays whose dimensions are determined by the number of classes and the total number of feature types. The data structures are deliberately C-style: I allocate them by hand and am responsible for releasing them, rather than reaching for the heavier C++ STL containers such as set or map. Feature selection runs in between, and the final model is dumped to a plain text file.
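One remark on step 3.1, since the code drops part of the textbook formula. For a class c and feature t, dA through dD form the 2x2 contingency table: A = docs in c containing t, B = docs outside c containing t, C = docs in c without t, D = docs outside c without t. The full chi-square statistic over N documents is

\[
\chi^2(t,c) \;=\; \frac{N\,(AD-BC)^2}{(A+B)\,(C+D)\,(A+C)\,(B+D)}
\]

The code keeps only (AD-BC)^2 / ((A+B)(C+D)). The dropped factors N, (A+C), and (B+D) are constant for a fixed class (A+C is the doc count of c and B+D is N minus it), so leaving them out does not change how features rank against each other within one class, which is all the selection step needs. The DBL_MIN added to the denominator is not probabilistic smoothing, just a guard so a feature that never occurs yields 0 instead of 0/0.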


Next up: the feature selection algorithm.
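Finally, the multinomial tweak promised above, as a sketch rather than tested code. The assumption is that in the Bernoulli setup a feature id appears at most once per sample line (so the counts are document counts), while the multinomial model counts every occurrence and normalizes by the total term count of the class. ppCountMatrix and pClassTermNum are hypothetical names; only step 2.3 and the p(feature|class) line of step 3.1 would change, roughly like this:

// sketch of the multinomial variant (hypothetical names, not the Train code above)
// extra per-class counter, allocated and zeroed like the others:
// double * pClassTermNum = new double [iClassNum];

// step 2.3: count every occurrence of a feature, plus the class's total term count
while (isLine >> sTmpItem)
{
    int iFeaItemId = atoi (sTmpItem.c_str());
    if (iFeaItemId < 0 || iFeaItemId >= iFeaTypeNum)
        continue;
    ppCountMatrix [iClassId][iFeaItemId]++;  // term frequency, not doc frequency
    pClassTermNum [iClassId]++;
}

// step 3.1: p(feature|class) with add-one (Laplace) smoothing
ppPProbMatrix[i][j] = (ppCountMatrix[i][j] + 1.0)
                    / (pClassTermNum[i] + (double)iFeaTypeNum);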



