Click here to Skip to main content
15,884,298 members
Articles / Programming Languages / C#

hOOt - full text search engine

Rate me:
Please Sign up or sign in to vote.
4.92/5 (156 votes)
24 Feb 2019CPOL17 min read 1.1M   22.5K   388  
Smallest full text search engine (lucene replacement) built from scratch using inverted MGRB bitmap index, highly compact storage, operating in database and document modes
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.IO;
using RaptorDB.Common;

namespace RaptorDB
{
    #region [ internal classes ]
    internal struct PageInfo  // FEATURE : change back to class for count access for query caching
    {
        public PageInfo(int pagenum, int uniquecount, int duplicatecount)
        {
            PageNumber = pagenum;
            UniqueCount = uniquecount;
        }
        public int PageNumber;
        public int UniqueCount;
    }

    internal struct KeyInfo
    {
        public KeyInfo(int recnum)
        {
            RecordNumber = recnum;
            DuplicateBitmapNumber = -1;
        }
        public KeyInfo(int recnum, int bitmaprec)
        {
            RecordNumber = recnum;
            DuplicateBitmapNumber = bitmaprec;
        }
        public int RecordNumber;
        public int DuplicateBitmapNumber;
    }

    internal class Page<T>
    {
        public Page() // kludge so the compiler doesn't complain
        {
            DiskPageNumber = -1;
            RightPageNumber = -1;
            tree = new SafeDictionary<T, KeyInfo>(Global.PageItemCount);
            isDirty = false;
            FirstKey = default(T);
        }
        public int DiskPageNumber;
        public int RightPageNumber;
        public T FirstKey;
        public bool isDirty;
        public SafeDictionary<T, KeyInfo> tree;
    }


    internal class Statistics
    {
        public int PageCount = 0;
        public double TotalSplitTime = 0;
        public double FillFactor = 0;

        public override string ToString()
        {
            string s = "Page Count = " + PageCount + ", Total Split Time = " + TotalSplitTime;// +", Fill Factor = " + FillFactor;
            return s;
        }
    }
    #endregion

    internal class MGIndex<T> where T : IComparable<T>
    {
        ILog _log = LogManager.GetLogger(typeof(MGIndex<T>));
        private SafeSortedList<T, PageInfo> _pageList = new SafeSortedList<T, PageInfo>();
        private SafeDictionary<int, Page<T>> _cache = new SafeDictionary<int, Page<T>>();
        private List<int> _pageListDiskPages = new List<int>();
        private IndexFile<T> _index;
        private bool _AllowDuplicates = true;
        private int _LastIndexedRecordNumber = 0;
        private int _maxPageItems = 0;

        public MGIndex(string path, string filename, byte keysize, ushort maxcount, bool allowdups)
        {
            _AllowDuplicates = allowdups;
            _index = new IndexFile<T>(path + Path.DirectorySeparatorChar + filename, keysize, maxcount);
            _maxPageItems = maxcount;
            // load page list
            _index.GetPageList(_pageListDiskPages, _pageList, out _LastIndexedRecordNumber);
            if (_pageList.Count == 0)
            {
                Page<T> page = new Page<T>();
                page.FirstKey = (T)RDBDataType<T>.GetEmpty();
                page.DiskPageNumber = _index.GetNewPageNumber();
                page.isDirty = true;
                _pageList.Add(page.FirstKey, new PageInfo(page.DiskPageNumber, 0, 0));
                _cache.Add(page.DiskPageNumber, page);
            }
        }

        public int GetLastIndexedRecordNumber()
        {
            return _LastIndexedRecordNumber;
        }

        private object _setlock = new object();
        public void Set(T key, int val)
        {
            lock (_setlock)
            {
                PageInfo pi;
                Page<T> page = LoadPage(key, out pi);

                KeyInfo ki;
                if (page.tree.TryGetValue(key, out ki))
                {
                    // item exists
                    if (_AllowDuplicates)
                    {
                        SaveDuplicate(key, ref ki);
                        // set current record in the bitmap also
                        _index.SetBitmapDuplicate(ki.DuplicateBitmapNumber, val);
                    }
                    ki.RecordNumber = val;
                    page.tree[key] = ki; // structs need resetting
                }
                else
                {
                    // new item 
                    ki = new KeyInfo(val);
                    if (_AllowDuplicates)
                        SaveDuplicate(key, ref ki);
                    pi.UniqueCount++;
                    page.tree.Add(key, ki);
                }

                if (page.tree.Count > Global.PageItemCount)
                    SplitPage(page);

                _LastIndexedRecordNumber = val;
                page.isDirty = true;
            }
        }

        public bool Get(T key, out int val)
        {
            val = -1;
            PageInfo pi;
            Page<T> page = LoadPage(key, out pi);
            KeyInfo ki;
            bool ret = page.tree.TryGetValue(key, out ki);
            if (ret)
                val = ki.RecordNumber;
            return ret;
        }

        public void SaveIndex()
        {
            _log.Debug("Total split time (s) = " + _totalsplits);
            _log.Debug("Total pages = " + _pageList.Count);
            int[] keys = _cache.Keys();
            Array.Sort(keys);
            // save index to disk
            foreach (var i in keys)
            {
                var p = _cache[i];
                if (p.isDirty)
                {
                    _index.SavePage(p);
                    p.isDirty = false;
                }
            }
            _index.BitmapFlush();
        }

        public void Shutdown()
        {
            // save page list
            _index.SavePageList(_pageList, _pageListDiskPages);
            // shutdown
            _index.Shutdown();
        }

        public void FreeMemory()
        {
            // TODO : implement when needed
        }


        public IEnumerable<int> GetDuplicates(T key)
        {
            PageInfo pi;
            Page<T> page = LoadPage(key, out pi);
            KeyInfo ki;
            bool ret = page.tree.TryGetValue(key, out ki);
            if (ret)
                // get duplicates
                if (ki.DuplicateBitmapNumber != -1)
                    return _index.GetDuplicatesRecordNumbers(ki.DuplicateBitmapNumber);

            return new List<int>();
        }

        public void SaveLastRecordNumber(int recnum)
        {
            _index.SaveLastRecordNumber(recnum);
        }

        public bool RemoveKey(T key)
        {
            PageInfo pi;
            Page<T> page = LoadPage(key, out pi);
            bool b = page.tree.Remove(key);
            if (b)
            {
                pi.UniqueCount--;
                // FEATURE : decrease dup count
            }
            page.isDirty = true;
            return b;
        }

        //public Statistics GetStatistics()
        //{
        //    Statistics s = new Statistics();
        //    s.TotalSplitTime = _totalsplits;
        //    s.PageCount = _pageList.Count;

        //    return s;
        //}

        #region [  P R I V A T E  ]
        private double _totalsplits = 0;
        private void SplitPage(Page<T> page)
        {
            // split the page
            DateTime dt = FastDateTime.Now;

            Page<T> newpage = new Page<T>();
            newpage.DiskPageNumber = _index.GetNewPageNumber();
            newpage.RightPageNumber = page.RightPageNumber;
            newpage.isDirty = true;
            page.RightPageNumber = newpage.DiskPageNumber;
            // get and sort keys
            T[] keys = page.tree.Keys();
            Array.Sort<T>(keys);
            // copy data to new 
            for (int i = keys.Length / 2; i < keys.Length; i++)
            {
                newpage.tree.Add(keys[i], page.tree[keys[i]]);
                // remove from old page
                page.tree.Remove(keys[i]);
            }
            // set the first key
            newpage.FirstKey = keys[keys.Length / 2];
            // set the first key refs
            _pageList.Remove(page.FirstKey);
            _pageList.Remove(keys[0]);
            // dup counts
            _pageList.Add(keys[0], new PageInfo(page.DiskPageNumber, page.tree.Count, 0));
            page.FirstKey = keys[0];
            // FEATURE : dup counts
            _pageList.Add(newpage.FirstKey, new PageInfo(newpage.DiskPageNumber, newpage.tree.Count, 0));
            _cache.Add(newpage.DiskPageNumber, newpage);

            _totalsplits += FastDateTime.Now.Subtract(dt).TotalSeconds;
        }

        private Page<T> LoadPage(T key, out PageInfo pageinfo)
        {
            int pagenum = -1;
            // find page in list of pages

            bool found = false;
            int pos = 0;
            if (key != null)
                pos = FindPageOrLowerPosition(key, ref found);
            pageinfo = _pageList.GetValue(pos);
            pagenum = pageinfo.PageNumber;

            Page<T> page;
            if (_cache.TryGetValue(pagenum, out page) == false)
            {
                //load page from disk
                page = _index.LoadPageFromPageNumber(pagenum);
                _cache.Add(pagenum, page);
            }
            return page;
        }

        private Page<T> LoadPage(int pagenum)
        {
            Page<T> page;
            if (_cache.TryGetValue(pagenum, out page) == false)
            {
                //load page from disk
                page = _index.LoadPageFromPageNumber(pagenum);
                _cache.Add(pagenum, page);
            }
            return page;
        }

        private void SaveDuplicate(T key, ref KeyInfo ki)
        {
            if (ki.DuplicateBitmapNumber == -1)
                ki.DuplicateBitmapNumber = _index.GetBitmapDuplaicateFreeRecordNumber();

            _index.SetBitmapDuplicate(ki.DuplicateBitmapNumber, ki.RecordNumber);
        }

        private int FindPageOrLowerPosition(T key, ref bool found)
        {
            if (_pageList.Count == 0)
                return 0;
            // binary search
            int lastlower = 0;
            int first = 0;
            int last = _pageList.Count - 1;
            int mid = 0;
            while (first <= last)
            {
                mid = (first + last) >> 1;
                T k = _pageList.GetKey(mid);
                int compare = k.CompareTo(key);
                if (compare < 0)
                {
                    lastlower = mid;
                    first = mid + 1;
                }
                if (compare == 0)
                {
                    found = true;
                    return mid;
                }
                if (compare > 0)
                {
                    last = mid - 1;
                }
            }

            return lastlower;
        }
        #endregion
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Architect -
United Kingdom United Kingdom
Mehdi first started programming when he was 8 on BBC+128k machine in 6512 processor language, after various hardware and software changes he eventually came across .net and c# which he has been using since v1.0.
He is formally educated as a system analyst Industrial engineer, but his programming passion continues.

* Mehdi is the 5th person to get 6 out of 7 Platinum's on Code-Project (13th Jan'12)
* Mehdi is the 3rd person to get 7 out of 7 Platinum's on Code-Project (26th Aug'16)

Comments and Discussions