Click here to Skip to main content
15,896,606 members
Articles / Programming Languages / C++

A Lite-Memory Dictionary

Rate me:
Please Sign up or sign in to vote.
3.81/5 (9 votes)
10 Mar 20078 min read 66K   1.2K   21  
An article on the implementation of a dictionary where everything is kept in the disk as a B-Tree.
// Dictionary.h: interface for the CDictionary class.
//
//////////////////////////////////////////////////////////////////////

#if !defined(AFX_DICTIONARY_H__F17F3FC5_2376_4678_A66E_67E9D5C08438__INCLUDED_)
#define AFX_DICTIONARY_H__F17F3FC5_2376_4678_A66E_67E9D5C08438__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

#pragma warning (disable:4786)

#include <map>
#include <list>
#include <stack>
#include <vector>

using namespace std;

//typedefs
typedef multimap<int , CString, greater<int> >				MMAP_REVERSE_INT_STR;

#define		DICTIONARY_LITE_HEADER_SIZE							100
#define		DICTIONARY_LITE_DEFAULT_NO_OF_STR_IN_NODE			2	//5
#define		DICTIONARY_LITE_DEFAULT_NO_OF_STR_IN_TRANSITION		2	//6
#define		DICTIONARY_LITE_DEFAULT_STR_LEN						10	
#define		DICTIONARY_LITE_INVALID_OFFSET						-1	
#define		DICTIONARY_LITE_HEADER_INITIAL_CHAR_LEN				2
#define		DICTIONARY_LITE_HEADER_PARAM_PART_LENGTH			60
#define		DICTIONARY_LITE_HEADER_STAT_PART_LENGTH				38	

class CStrList : public list<CString>
{
public:
	CStrList() {}
	~CStrList() {}

	CStrList& operator += (CStrList& oStrList)
	{
		list<CString>::iterator ite1 = oStrList.begin();
		list<CString>::iterator iteEnd1 = oStrList.end();
		for( ; ite1 != iteEnd1; ite1++)
			push_back(*ite1);
		return *this;
	}

	CString GetConcatenatedString()
	{
		CString s = "";
		list<CString>::iterator ite1 = begin();
		list<CString>::iterator iteEnd1 = end();
		for( ; ite1 != iteEnd1; ite1++)
			s += *ite1;
		return s;
	}

	bool IsEmpty()	{return (size() == 0);}
	
};


class CDictionaryLite  
{
private:
	// dictionary parameters
	int i_StrLen;
	int i_NoOfStrInNode;
	int i_NoOfStrInTransition;
	int i_NoOfNodes;
	int i_NodeSize;
	int i_TransitionSize;
	int i_TransitionMilestoneSize;
	CString s_Filename;
	CFile m_File;
	bool b_KeepFile;
	int i_CurrNodeOffset;

	//dictionary statistics
	int i_DistinctStringCount;
	int i_CummulativeStringCount;

	//char buffer to hold the header
	char a_Header[DICTIONARY_LITE_HEADER_SIZE];

	//char buffer to hold current node 
	char* p_CurrNodeBuff;

	//char buffer to hold backup node in memory
	char* p_BackupNodeBuff;

	//char buffer to hold current transition in memory
	char* p_CurrTransitionBuff;

	//char buffer to hold current transition milestone in memory
	char* p_CurrTransitionMilestone;

	//root info
	bool b_IsRootFinal;
	map<CString, int> map_RootTransitions;

	//variables used when retrieving words
	stack<map<CString, int> > stk_TransitionMaps;
	stack<int> stk_NodeOffsets;
	stack<CString> stk_Str;

	//Initialize
	void Init();
	void InitFromFile();
	void Finalize();
	
	//methods to read from and write to file
	void ReadNode(int iOffset);
	void ReadTransition(int iOffset);
	void WriteNode(int iOffset);
	int AppendNode();
	void WriteTransition(int iOffset);
	int AppendTransition();
	void ReadHeader();
	void WriteHeader();
	void ReadTransitionMilestoneBlock(int iOffset);
	void WriteTransitionMilestoneBlock(int iOffset);
	int AppendTransitionMilestone();

	//Header manipulation
	void ExtractValuesFromHeader();
	void WriteValuesToHeader();

	//methods regarding the current node (which is in memory)
	bool IsCurrNodeFinal();
	int GetIDOfCurrNode();
	int GetCountOfCurrNode();
	int GetNextPartOffsetOfCurrentNode();
	int GetDestNodeFromCurrNode(CString str);
	void GetTransitionMilestonesFromCurrNode(map<CString, int>& mapTransitions);
	void GetAllTransitionsFromCurrentNode(map<CString, int>& mapTransitions);
	void SetCurrNodeAsFinal(bool bIsFinal = true);
	void SetIDOfCurrNode(int iID);
	void SetCountOfCurrNode(int iCount);
	void SetDestNodeFromCurrentNode(int iOffset);
	void SetTransitionMilestonesOfCurrNode(map<CString, int>& mapTransitions);
	void AddTransitionToCurrNode(CString str, int iOffset);		//iOffset is the offset of destination node
	void AddTransitionToTransitionMilestone(int iTMOffset, CString str, int iOffset);

	//methods regarding the current transition (which is in memory)
	void GetTransitionsFromCurrTransition(map<CString, int>& mapTransitions);
	void WriteTransitionsToCurrTransaction(map<CString, int>& mapTransitions);

	//methods regarding transition milestone blocks
	void AddStringToTransitionMilestoneBlock(int iOffset, CString str, int iOffsetVal);
	void AddStringToTransitionMilestoneBlock(char* pBlock, CString str, int iOffsetVal);
	int GetOffsetToNextFromTransitionMilestoneBlock();
	void SetOffsetToNextOfTransitionMilestoneBlock(int iOffset);
	void GetTransitionsFromCurrTransitionMilestonesBlock(map<CString, int>& mapTransitions);
	void SetTransitionsOfCurrTransitionMilestonesBlock(map<CString, int>& mapTransitions);

	//other utility functions
	void ReadStringIntArrayBlock(map<CString, int>& mapBlock, char* pStart, int iMaxNo);
	void WriteStringIntArrayBlock(map<CString, int>& mapBlock, char* pStart, int iMaxNo);
	int NewNode();
	void LoadRoot();
	int GetRootOffset();

	//utility functions for word retreival
	void FlushStacks();
	CStrList GetStringFromStack();

public:
	CDictionaryLite(CString sFilename);
	CDictionaryLite(CString sFilename, bool bKeepFile, int iStrLen = DICTIONARY_LITE_DEFAULT_STR_LEN, int iNoOfStrInNode = DICTIONARY_LITE_DEFAULT_NO_OF_STR_IN_NODE, int iNoOfStrInTransition = DICTIONARY_LITE_DEFAULT_NO_OF_STR_IN_TRANSITION);
	virtual ~CDictionaryLite();

	//access methods
	int GetStrLen();
	int GetNoOfStrInNode();
	int GetNoOfStrInTransition();
	int GetNodeCount();
	int GetDistinctStringCount();
	int GetCummulativeStringCount();
	void SetStrLen(int iLen);
	void SetNoOfStrInNode(int iNo);
	void SetNoOfStrInTransition(int iNo);

	//utility methods
	void AddString(CStrList& oStrList, int iCount = 1);
	int GetStringCount(CStrList& oStrList);
	bool GetFirstWord(CStrList& oStrList, int& iCount);
	bool GetNextWord(CStrList& oStrList, int& iCount);
	void WriteToTextFile(CString sFilename);
	void WriteToTextFileDescending(CString sFilename);

	//debugging
	void PrintNodes(CString sFilename);
	void AppendTreeToFile(CString sFilename, int iOffset);
};

#endif // !defined(AFX_DICTIONARY_H__F17F3FC5_2376_4678_A66E_67E9D5C08438__INCLUDED_)

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
Sri Lanka Sri Lanka
I am a Sri Lankan Software Engineer with 3 years experience in software development. I have a C background and have experience in MFC. Iam very much interested in AI especially Natural Language Processing. I have had done some research in Sinhala which is my native language.
In my spare time I use to read novels and politics.

Comments and Discussions