Searcharoo 2007 (Medium Trust and Office 2007 indexing)

craigd

Rate me:

4.95/5 (11 votes)

28 Apr 2007 · CPOL · 10 min read

204.2K

849

Remove Binary Serialization to solve Medium Trust problem; index OpenXML document formats in ASP.NET/C# free search engine

searcharoo_5.zip
- EPocalipse.IFilter
  - bin
    - Release
      - EPocalipse.IFilter.dll
  - ComHelper.cs
  - EPocalipse.IFilter.csproj
  - FilterLoader.cs
  - FilterReader.cs
  - IFilter.cs
  - Properties
    - AssemblyInfo.cs
- Mono.GetOptions
  - bin
    - Release
      - Mono.GetOptions.dll
  - GetOptTest
    - GetOptTester.cs
  - Mono.GetOptions.csproj
  - Mono.GetOptions.Useful
    - CommonCompilerOptions.cs
  - Mono.GetOptions
  - Mono
  - Properties
    - AssemblyInfo.cs
- Searcharoo.Indexer
  - app.config
  - bin
    - Release
      - EPocalipse.IFilter.dll
      - Mono.GetOptions.dll
      - Searcharoo.dll
      - Searcharoo.Indexer.exe
      - Searcharoo.Indexer.exe.config
  - CommandLinePreferences.cs
  - Program.cs
  - Properties
    - AssemblyInfo.cs
  - Searcharoo.Indexer.csproj
- Searcharoo.sln
- Searcharoo
  - AllDiagram.cd
  - bin
    - Release
      - EPocalipse.IFilter.dll
      - Searcharoo.dll
  - Common
    - Catalog.cs
    - CatalogBinder.cs
    - CatalogWordFile.cs
    - File.cs
    - GoWords
      - GoWords.cs
      - IGoWord.cs
    - Kelvin.cs
    - Preferences.cs
    - ResultFile.cs
    - Stemming
    - StopWords
      - IStopper.cs
      - StopWords.cs
    - Word.cs
  - Core_CatalogWordFile.cd
  - DocumentDiagram.cd
  - Engine
    - Search.cs
  - Indexer
    - Documents
    - ProgressEventArgs.cs
    - RobotsTxt.cs
    - Spider.cs
    - Zip
      - Crc32.cs
      - Zip.cs
  - IndexerDiagram.cd
  - Properties
    - AssemblyInfo.cs
  - Searcharoo.csproj
  - SearchEngineDiagram.cd
  - SpiderDiagram.cd
- WebApplication
  - App_Code
  - bin
    - EPocalipse.IFilter.dll
    - Searcharoo.dll
    - WebApplication.dll
  - default.aspx
  - Properties
    - AssemblyInfo.cs
  - robots.txt
  - Search.aspx
  - SearchControl.ascx
  - SearchSpider.aspx
  - Web.config
  - WebApplication.csproj

using System;
using System.IO;
using System.Xml;
using ionic.utils.zip;

namespace Searcharoo.Common
{
    /// <summary>
    /// Loads a Microsoft Excel 2007 (.xlsx, Office Open XML) workbook and
    /// extracts the text of its cells so the content can be indexed.
    /// </summary>
    /// <remarks>
    /// <see cref="DocxDocument" />
    /// 
    /// Xlsx...
    /// http://www.gemboxsoftware.com/Excel2007/DemoApp.htm
    /// </remarks>
    public class XlsxDocument : DownloadDocument
    {
        /// <summary>Zip entry holding the workbook's sheet list.</summary>
        private const string WorkbookEntry = "xl/workbook.xml";

        /// <summary>Zip entry holding the text that cells of type "s" refer to by index.</summary>
        private const string SharedStringsEntry = "xl/sharedStrings.xml";

        /// <summary>Zip entry name pattern of each worksheet; {0} is the 1-based sheet number.</summary>
        private const string SheetEntryPattern = @"xl/worksheets/sheet{0}.xml";

        private string _WordsOnly;

        public XlsxDocument(Uri location)
            : base(location)
        { }

        /// <summary>
        /// Intentionally does nothing: no link extraction is performed for workbooks.
        /// </summary>
        public override void Parse()
        {
            // no parsing (for now). perhaps in future we can regex look for urls (www.xxx.com) and try to link to them...
        }

        /// <summary>
        /// Text extracted from the workbook by <see cref="GetResponse"/>;
        /// null until a worksheet has been read successfully.
        /// </summary>
        public override string WordsOnly
        {
            get { return _WordsOnly; }
        }

        /// <summary>
        /// Saves the response to a temporary file, opens it as a zip archive and
        /// collects the cell text of every worksheet into <see cref="WordsOnly"/>.
        /// </summary>
        /// <param name="webresponse">Response whose body is the .xlsx file.</param>
        /// <returns>
        /// true when some text was extracted (Description is then populated);
        /// false when nothing could be read from the workbook.
        /// </returns>
        /// <remarks>
        /// Extraction is best-effort: a failure while reading the archive is written
        /// to the console and whatever was gathered from earlier sheets is kept.
        /// 
        /// .NET System.IO.Compression and zip files
        /// http://blogs.msdn.com/dotnetinterop/archive/2006/04/05/.NET-System.IO.Compression-and-zip-files.aspx
        /// </remarks>
        public override bool GetResponse(System.Net.HttpWebResponse webresponse)
        {
            string filename = System.IO.Path.Combine(
                          Preferences.DownloadedTempFilePath
                        , (System.IO.Path.GetFileName(this.Uri.LocalPath)));
            this.Title = System.IO.Path.GetFileNameWithoutExtension(filename);

            SaveDownloadedFile(webresponse, filename);
            try
            {   // Will be accessing this data in the xlsx file
                //  xl/workbook.xml              sheet
                //  xl/sharedStrings.xml         si
                //  xl/worksheets/sheet{0}.xml   c, v, is
                using (ZipFile zip = ZipFile.Read(filename))
                {
                    // the workbook 'root' file tells us how many sheets there are
                    int sheetCount = LoadXmlEntry(zip, WorkbookEntry).GetElementsByTagName("sheet").Count;
                    string[] sharedStrings = LoadSharedStrings(zip);

                    // NOTE(review): assumes the sheets are stored as sheet1..sheetN, which is
                    // the naming Excel normally uses; a gap in the numbering stops the loop
                    // at the first missing entry (handled by the catch below).
                    for (int sheetId = 1; sheetId <= sheetCount; sheetId++)
                    {
                        XmlDocument xmlSheet = LoadXmlEntry(zip, String.Format(SheetEntryPattern, sheetId));

                        System.Text.StringBuilder sheetWords = new System.Text.StringBuilder();
                        foreach (XmlElement cell in xmlSheet.GetElementsByTagName("c"))
                        {
                            string text = GetCellText(cell, sharedStrings);
                            if (text != null)
                            {
                                sheetWords.Append(' ').Append(text);
                            }
                        }

                        // Publish after every sheet so that a later failure keeps earlier sheets.
                        _WordsOnly = _WordsOnly + " " + sheetWords + Environment.NewLine + Environment.NewLine;
                        this.All = _WordsOnly;
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort: a corrupt or unexpected archive must not abort the crawl.
                Console.WriteLine(ex.Message);
            }
            finally
            {
                DeleteTempFile(filename);    // clean up
            }

            // IsNullOrEmpty (rather than != string.Empty) so that a null All,
            // i.e. nothing extracted at all, is reported as a failure.
            if (string.IsNullOrEmpty(this.All))
            {
                return false;
            }
            this.Description = base.GetDescriptionFromWordsOnly(WordsOnly);
            return true;
        }

        /// <summary>
        /// Extracts one entry of the archive into memory and parses it as XML.
        /// </summary>
        private static XmlDocument LoadXmlEntry(ZipFile zip, string entryName)
        {
            using (MemoryStream stream = new MemoryStream())
            {
                zip.Extract(entryName, stream);
                stream.Seek(0, SeekOrigin.Begin);
                XmlDocument document = new XmlDocument();
                document.Load(stream);
                return document;
            }
        }

        /// <summary>
        /// Reads the shared-string table: one entry per "si" element, in document
        /// order, which is the order the cell indexes refer to.
        /// Returns an empty array when the table cannot be read.
        /// </summary>
        private static string[] LoadSharedStrings(ZipFile zip)
        {
            System.Collections.Generic.List<string> strings = new System.Collections.Generic.List<string>();
            try
            {
                XmlDocument document = LoadXmlEntry(zip, SharedStringsEntry);
                foreach (XmlElement item in document.GetElementsByTagName("si"))
                {
                    // InnerText also flattens rich-text runs into plain text.
                    strings.Add(item.InnerText);
                }
            }
            catch (Exception ex)
            {
                // The part is optional (a workbook without text cells has none). How the zip
                // library reports a missing entry is not visible here, hence the broad catch;
                // numeric cell values are still indexed.
                Console.WriteLine(ex.Message);
            }
            return strings.ToArray();
        }

        /// <summary>
        /// Returns the indexable text of one cell ("c" element), or null when the
        /// cell carries no value.
        /// </summary>
        private static string GetCellText(XmlElement cell, string[] sharedStrings)
        {
            string cellType = cell.GetAttribute("t");   // empty string when the attribute is absent

            if (cellType == "inlineStr")
            {   // text stored directly in the cell, inside an "is" element
                XmlElement inline = cell["is"];
                return (inline == null) ? null : inline.InnerText;
            }

            XmlElement value = cell["v"];
            if (value == null)
            {
                return null;
            }

            if (cellType == "s")
            {   // the value is an index into the shared-string table, not the text itself
                int index;
                if (int.TryParse(value.InnerText
                        , System.Globalization.NumberStyles.Integer
                        , System.Globalization.CultureInfo.InvariantCulture
                        , out index)
                    && index >= 0 && index < sharedStrings.Length)
                {
                    return sharedStrings[index];
                }
            }

            // numbers, booleans, formula results - or an unresolvable shared-string index
            return value.InnerText;
        }

        /// <summary>
        /// Removes the temporary download; a failure is logged and otherwise ignored
        /// so that it never masks the result of the extraction.
        /// </summary>
        private static void DeleteTempFile(string filename)
        {
            try
            {
                System.IO.File.Delete(filename);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Written By

craigd

Web Developer

Australia

-- ooo ---
www.conceptdevelopment.net
conceptdev.blogspot.com
www.searcharoo.net
www.recipenow.net
www.racereplay.net
www.silverlightearth.com

Searcharoo 2007 (Medium Trust and Office 2007 indexing)

License

Comments and Discussions