/***************************************************************************
[Copyright Info]
File Name: parser.cpp
Description:
Implementation of text parser
Compile to: parser.dll
***************************************************************************/
#include "StdAfx.h"
#include "parser.h"
#include <string.h>
#include <tchar.h>
// Language identifier reported through g_parserDesc.dwLanguage.
#define SITELANGUAGE LANGUAGE_MANDARIN_CHINESE
// Number of entries in g_category; reported through g_parserDesc.nCategory.
#define NUM_CATEGORY (11)
// Site display name reported through g_parserDesc.psSiteName.
// NOTE(review): the literal is non-ASCII and looks mis-encoded in this view;
// presumably the file is stored in a legacy Chinese code page -- confirm.
#define SITENAME "�����������������"
// Site root URL reported through g_parserDesc.psSiteUrl.
#define SITEURL ("http://www.chinanews.com.cn")
// Substring ParseTitle searches for to locate each article URL in a page.
#define LANDMARK_TITLEURL "n/2004"
// Substring ParseTitle expects directly in front of each article title text.
#define LANDMARK_TITLE "=_blank>"
// Table of the news categories handled by this parser; GetParserDescriptor()
// publishes it through g_parserDesc.pCategory.
// Each initializer is {category name, category page URL, NULL, 0}.
// NOTE(review): the last two fields are presumably the title list and the
// title count (ParseTitle assigns an nTitle member) -- confirm against the
// CATEGORY declaration in parser.h.
// NOTE(review): the category names are non-ASCII and look mis-encoded in this
// view; presumably a legacy Chinese code page -- confirm the file encoding.
CATEGORY g_category[NUM_CATEGORY]={
{"��������","http://www.chinanews.com.cn/scroll-news/1.shtml", NULL,0},
{"��������","http://www.chinanews.com.cn/china.shtml", NULL,0},
{"��������","http://www.chinanews.com.cn/world.shtml", NULL,0},
{"�������","http://www.chinanews.com.cn/society.shtml", NULL,0},
{"��������","http://www.chinanews.com.cn/economic.shtml", NULL,0},
{"��������","http://www.chinanews.com.cn/sports.shtml", NULL,0},
{"��������","http://www.chinanews.com.cn/entertainment.shtml", NULL,0},
{"�ƽ�����","http://www.chinanews.com.cn/science.shtml", NULL,0},
{"��������","http://www.chinanews.com.cn/huaren.shtml", NULL,0},
{"�۰�����","http://www.chinanews.com.cn/compatriot.shtml", NULL,0},
{"̨������","http://www.chinanews.com.cn/taiwan.shtml", NULL,0}};
// Parser descriptor instance; filled in and returned by GetParserDescriptor().
PARSERDESC g_parserDesc;
// Upper bound of the category loop in FindCategoryFromString; same value as
// NUM_CATEGORY.
#define MAX_CATEGORY 11
// NOTE(review): MAX_SPAN, the STRING_LABLE* markers and the *_LEN constants
// below are not referenced in the visible part of this file -- possibly
// leftovers from an earlier parsing approach; confirm before removing.
#define MAX_SPAN 200
// Common prefix of article URLs on the site.
#define STRING_LABLE1 "http://www.chinanews.com.cn/n/"
#define STRING_LABLE2 "blank>"
#define STRING_LABLE3 ">>"
// Opening of an HTML anchor tag up to and including the quote before the URL.
#define STRING_LABLE4 "<a href=\""
#define STRING_LABLE4_LEN 9 // _tcslen(STRING_LABLE4)
#define CAT1_ITEM_URL_LEN 55 // eg. http://www.chinanews.com.cn/n/2004-02-03/26/397496.html
#define STRING_LABLE1_LEN 30 // _tcslen(STRING_LABLE1)
#define STRING_LABLE2_LEN 6 // _tcslen(STRING_LABLE2)
#define CAT2_ITEM_URL_LEN 25 // CAT1_ITEM_URL_LEN - STRING_LABLE1_LEN =55-30
// NOTE(review): initialised to 0 and not referenced in the visible part of
// this file.
UINT g_uItem=0;
// Helper Function
/*
 * FindCategoryFromString
 *
 * Looks up a category in g_category by comparing only the leading characters
 * of its name.
 *
 * psCategory  NUL-terminated category name to look up; may be NULL.
 *
 * Returns the index of the first entry of g_category whose name agrees with
 * psCategory in its first 5 TCHARs (or in full, when both strings are
 * shorter than that). Returns (UINT)-1 when nothing matches or psCategory
 * is NULL.
 *
 * The comparison is done in place with _tcsncmp. The previous version
 * duplicated both strings on every iteration without freeing them and wrote
 * a terminator at index 5 even when a string was shorter than 5 characters.
 */
UINT FindCategoryFromString(PTSTR psCategory)
{
    // Number of leading TCHARs that take part in the comparison.
    // NOTE(review): the original comment spoke of "the first two Chinese
    // characters", but the cut-off has always been 5 TCHARs; kept as is.
    const size_t cchPrefix = 5;

    if(psCategory == NULL)
        return (UINT)-1;

    for(UINT i=0;i<MAX_CATEGORY;i++)
    {
        if(g_category[i].psCategoryName == NULL)
            continue;
        if(_tcsncmp(psCategory, g_category[i].psCategoryName, cchPrefix)==0)
            return i;
    }
    return (UINT)-1;
}
/*
 * ParseTitle
 *
 * Extracts article URLs and titles from the text of a category page.
 *
 * psText     NUL-terminated page text to scan; NULL yields 0.
 * nCategory  Category index; stored in every TITLE that is filled and used
 *            to index g_parserDesc.pCategory in counting mode.
 * pTitle     NULL for counting mode, otherwise the first element of an array
 *            of TITLE records to fill.
 *            NOTE(review): the array is presumably sized from a previous
 *            counting-mode call -- confirm against the caller.
 *
 * Counting mode (pTitle == NULL): counts the occurrences of
 * LANDMARK_TITLEURL, stores the count in the category's nTitle member (only
 * when the category table is set and nCategory is within NUM_CATEGORY) and
 * returns it.
 *
 * Fill mode: for every LANDMARK_TITLEURL occurrence that is followed by
 * LANDMARK_TITLE and a closing "</", stores the site prefix plus 28
 * characters from the landmark as psContentUrl and the text between
 * LANDMARK_TITLE and "</" as psTitle. Both strings are allocated with
 * _tcsdup and are not freed here. Scanning stops at the first entry whose
 * title markers are missing. Titles longer than the local buffer are
 * truncated.
 *
 * Returns the number of LANDMARK_TITLEURL occurrences in psText. In fill
 * mode the number of records actually written can be smaller than that.
 */
INT ParseTitle(PTSTR psText, INT nCategory, TITLE* pTitle)
{
    TCHAR ps[1024]="";
    const size_t cchMaxTitle = sizeof(ps)/sizeof(ps[0]) - 1;
    // Characters copied from the landmark to form the URL tail.
    // NOTE(review): the sample "n/2004-02-04/26/397692.html" is 27 characters
    // long; 28 is kept unchanged to preserve the existing output -- confirm.
    const size_t cchUrlTail = 28;
    PTSTR pt;
    INT nItem=0;

    if(psText == NULL)
        return 0;

    // Pass 1: find out how many items there are in psText.
    for(pt=_tcsstr(psText,LANDMARK_TITLEURL); pt != NULL; pt=_tcsstr(pt+1,LANDMARK_TITLEURL))
        nItem++;

    if(pTitle == NULL)
    {
        // Guard the table access instead of writing through a NULL table or
        // an out-of-range index.
        if(g_parserDesc.pCategory != NULL && nCategory >= 0 && nCategory < NUM_CATEGORY)
            g_parserDesc.pCategory[nCategory].nTitle=nItem;
        return nItem;
    }

    // Pass 2: fill title data.
    pt=psText;
    while((pt=_tcsstr(pt,LANDMARK_TITLEURL)) != NULL)
    {
        PTSTR pTitleBegin=_tcsstr(pt,LANDMARK_TITLE);
        if(pTitleBegin == NULL)
            break; // no title marker after this URL: nothing more to extract
        pTitleBegin+=_tcslen(LANDMARK_TITLE);

        PTSTR pTitleEnd=_tcsstr(pTitleBegin, "</");
        if(pTitleEnd == NULL)
            break; // unterminated title: stop rather than read past the text

        size_t cchTitle=(size_t)(pTitleEnd-pTitleBegin);
        if(cchTitle > cchMaxTitle)
            cchTitle=cchMaxTitle; // keep the copy inside ps

        _tcscpy(ps, "http://www.chinanews.com.cn/");
        _tcsncat(ps, pt, cchUrlTail);
        pTitle->nCategory=nCategory;
        pTitle->psContentUrl=_tcsdup(ps);

        ps[0]='\0';
        _tcsncat(ps, pTitleBegin, cchTitle);
        pTitle->psTitle=_tcsdup(ps);

        pTitle++;
        pt=pTitleBegin; // continue searching after the title marker
    }
    return nItem;
}
/*
 * ParseContent
 *
 * Collects the text that directly follows each "<p>" tag of an article page.
 *
 * psText     NUL-terminated page text. It is lower-cased IN PLACE with
 *            _tcslwr before scanning, so the caller's buffer is modified
 *            and the extracted text is lower-cased as well.
 * nCategory  Not used by this function.
 * psContent  NULL to only measure, otherwise the destination buffer. It
 *            must hold at least as many TCHARs as this function returns.
 *            NOTE(review): callers presumably size it from a previous
 *            measuring call -- confirm.
 *
 * For every "<p>", the run of characters up to the next '<' is taken; runs
 * of 20 characters or fewer are skipped. The kept runs are concatenated
 * without separators and NUL-terminated.
 *
 * Returns the length of the concatenated text plus one for the terminator
 * (1 when nothing was extracted or psText is NULL).
 *
 * The text is written straight into psContent rather than through a fixed
 * 20000-element stack buffer, which overflowed on long articles. The size
 * is computed by pointer subtraction instead of casting pointers to LONG.
 */
INT ParseContent(PTSTR psText, INT nCategory, PTSTR psContent)
{
    const size_t cchTagOpen = 3;   // _tcslen("<p>")
    const size_t cchMinRun  = 20;  // runs must be longer than this to be kept
    size_t cchTotal = 0;
    PTSTR pScan = psText;

    if(psText == NULL)
    {
        if(psContent != NULL)
            *psContent='\0';
        return 1;
    }

    _tcslwr(psText);
    while((pScan=_tcsstr(pScan,"<p>")) != NULL)
    {
        pScan+=cchTagOpen; // Bypass the "<p>"
        PTSTR pRunEnd=_tcsstr(pScan, "<");
        if(pRunEnd == NULL)
            break; // no further tag, hence no further "<p>" either

        size_t cchRun=(size_t)(pRunEnd-pScan);
        if(cchRun > cchMinRun)
        {
            // memmove keeps this safe even if psContent aliases psText.
            if(psContent != NULL)
                memmove(psContent+cchTotal, pScan, cchRun*sizeof(TCHAR));
            cchTotal+=cchRun;
        }
    }

    if(psContent != NULL)
        psContent[cchTotal]='\0';
    return (INT)(cchTotal+1);
}
/*
 * GetParserDescriptor
 *
 * Fills the global g_parserDesc with this parser's site information,
 * category table and parsing callbacks, then returns its address.
 * Every call rewrites all of these fields with the same values.
 */
PARSERDESC* GetParserDescriptor()
{
    PARSERDESC* const pDesc=&g_parserDesc;

    // Site identity.
    pDesc->psSiteName=SITENAME;
    pDesc->psSiteUrl=SITEURL;
    pDesc->dwLanguage=SITELANGUAGE;

    // Category table.
    pDesc->pCategory=g_category;
    pDesc->nCategory=NUM_CATEGORY;

    // Parsing entry points.
    pDesc->pfnParseTitle=ParseTitle;
    pDesc->pfnParseContent=ParseContent;

    return pDesc;
}