Click here to Skip to main content
Click here to Skip to main content

Moving from Old SharePoint Server? Something to Import inside SharePoint 2013?

, 28 Jun 2014
Rate this:
Please Sign up or sign in to vote.
Moving from old SharePoint server? Something to import inside SharePoint 2013?

Introduction

This article describes a methodology for moving data from old systems to SharePoint 2013 quickly, for free, and in the safest way.
You can also use this technique to move FrontPage sites, static HTML, PHP, etc. to SharePoint 2013.

Prerequisites

Internet Explorer knowledge, C#, SharePoint 2013

Using the Code

Let's start with the classic Program.cs:

namespace SharePoint.Import.SpiderAgent {
    /// <summary>
    /// Hosts the process entry point of the spider agent application.
    /// </summary>
    static class Program {
        /// <summary>
        /// Application entry point: applies the visual-style and text-rendering
        /// settings, then hands a new <see cref="SpiderAgent"/> to <c>Application.Run</c>.
        /// </summary>
        [STAThread]
        static void Main() {
            // Both rendering switches are applied before the SpiderAgent instance is created.
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);

            SpiderAgent agent = new SpiderAgent();
            Application.Run(agent);
        }
    }
}

Now we need something to browse the old application server. We are going to wrap the "WebProxy" class inside a WebClient subclass so that requests authenticate against the company proxy (this way, you can "legally" pass through all the security requests):

namespace SharePoint.Import.SpiderAgent
{
    /// <summary>
    /// A <see cref="WebClient"/> that sends the current user's default credentials and,
    /// when a proxy user is supplied, routes its requests through an authenticated proxy.
    /// </summary>
    public class PersonalWebClient : WebClient
    {
        /// <summary>
        /// Proxy address used when the caller does not pass one. This is the placeholder
        /// value the class has always used; it is not a valid host name, so supply a real
        /// "host:port" through the <c>proxyAddress</c> argument to actually get a proxy.
        /// </summary>
        public const string DefaultProxyAddress = "intranet proxy:8080";

        /// <summary>
        /// Initializes a new instance of the <see cref="PersonalWebClient"/> class.
        /// </summary>
        /// <param name="proxyUser">The proxy user (e.g. <c>DOMAIN\user</c>). When null or empty, no proxy is configured.</param>
        /// <param name="proxyPassword">The proxy password.</param>
        /// <param name="proxyAddress">The proxy address as "host:port" or a full URI.</param>
        public PersonalWebClient(string proxyUser, string proxyPassword, string proxyAddress = DefaultProxyAddress)
        {
            this.UseDefaultCredentials = true;

            if (string.IsNullOrEmpty(proxyUser))
            {
                return;
            }

            try
            {
                this.Proxy = setProxy(proxyUser, proxyPassword, proxyAddress);
            }
            catch (System.UriFormatException ex)
            {
                // Construction stays best-effort (it never threw before), but a malformed
                // proxy address is now reported instead of being silently discarded.
                // The password is deliberately not part of the message.
                System.Diagnostics.Trace.TraceWarning(
                    "PersonalWebClient: proxy address '{0}' is not valid, continuing without it ({1}).",
                    proxyAddress,
                    ex.Message);
            }
        }

        /// <summary>
        /// Builds an authenticated proxy that is bypassed for local addresses.
        /// </summary>
        /// <param name="proxyUser">The proxy user (e.g. <c>DOMAIN\user</c>).</param>
        /// <param name="proxyPassword">The proxy password.</param>
        /// <param name="proxyAddress">The proxy address as "host:port" or a full URI.</param>
        /// <returns>The configured <see cref="WebProxy"/>.</returns>
        /// <exception cref="System.UriFormatException"><paramref name="proxyAddress"/> is not a valid address.</exception>
        public static WebProxy setProxy(string proxyUser, string proxyPassword, string proxyAddress = DefaultProxyAddress)
        {
            // Second argument: bypass the proxy for local addresses.
            WebProxy proxy = new WebProxy(proxyAddress, true);
            proxy.Credentials = new System.Net.NetworkCredential(proxyUser, proxyPassword);

            return proxy;
        }
    }
}

Now we need an HtmlParser, and a cool one :-D

This code is an extract of the spider, just to give an idea of the algorithm:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.IO;
using System.Security.Cryptography;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using mshtml;
using System.Net;
using HtmlAgilityPack;
using System.Web;
using System.Xml.Linq;
using System.Xml;
using System.Net.Mime;
using System.Windows.Forms;
using System.Threading;
using System.Diagnostics;

namespace SharePoint.Import.SpiderAgent
{
    /// <summary>
    /// The Parser class
    /// </summary>
    public class HtmlParser
    {
        static SortedList<string, Uri> md5VisitedPages;
        // ... (other members omitted from this extract)

        /// <summary>
        /// Strips markup and noise from an HTML fragment so it can be used as plain text
        /// (for example as a title).
        /// </summary>
        /// <remarks>
        /// Steps, in order: remove every tag; turn line breaks and tabs into spaces;
        /// drop the text "&amp;nbsp"; turn every semicolon into a space; trim; collapse
        /// runs of spaces into a single space.
        /// </remarks>
        /// <param name="input">The input.</param>
        /// <returns>The cleaned text, or an empty string when <paramref name="input"/> is null or empty.</returns>
        public static string RemoveSpecialCharacters(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return string.Empty;
            }

            // Remove anything that looks like a tag.
            string text = Regex.Replace(input, @"<[^>]*>", String.Empty);

            // Normalise every line-break flavour, not just Environment.NewLine, so pages
            // served with bare "\n" are cleaned the same way on every platform.
            text = text.Replace("\r\n", " ").Replace('\r', ' ').Replace('\n', ' ').Replace('\t', ' ');

            // "&nbsp;" disappears in two steps: the entity name is dropped here and the
            // trailing ';' becomes a space (as does every other semicolon).
            text = text.Replace(@"&nbsp", "").Replace(';', ' ').Trim();

            return Regex.Replace(text, @"[ ]{2,}", @" ");
        }

        /// <summary>
        /// Walks every anchor of a page: when the current <c>type</c> is "html"/"htm" the
        /// link is registered as a folder and followed; otherwise it is downloaded as a file.
        /// </summary>
        /// <remarks>
        /// Folders pushed on <c>stack</c> while walking the page are counted and popped again
        /// before the method returns, including when an error interrupts the walk.
        /// Errors are reported through <c>notify</c> and never propagate to the caller.
        /// </remarks>
        /// <param name="htmlDoc">The HTML document.</param>
        /// <param name="url">The URL of the page; relative links are appended to its directory.</param>
        /// <param name="lt">The locker type; not used by this method.</param>
        /// <param name="looker">The looker; not used by this method.</param>
        private void processHTMLBody(HtmlAgilityPack.HtmlDocument htmlDoc, Uri url, LockerType lt, string looker)
        {
            int pushed = 0;
            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches;
                // a page without links is not an error.
                HtmlNodeCollection links = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
                if (links == null)
                {
                    return;
                }

                string baseUrl = url.ToString();
                string baseDirectory = baseUrl.Substring(0, baseUrl.LastIndexOf('/') + 1);

                foreach (HtmlNode link in links)
                {
                    HtmlAttribute att = link.Attributes["href"];
                    string sUrl = att.Value;

                    if (type == "html" || type == "htm")
                    {
                        FileOrFolder fof = new FileOrFolder(FileOrFolderType.Folder);
                        fof.title = RemoveSpecialCharacters(att.OwnerNode.InnerText).Trim();
                        if (String.IsNullOrEmpty(fof.title))
                        {
                            continue;
                        }

                        stack.Peek().Children.Add(fof);
                        stack.Push(fof);
                        // Previously this counter was never incremented, so the unwind
                        // loop never ran and every folder stayed on the stack.
                        pushed++;
                        fof.sourceUrl = baseUrl;
                        retrieveHTML(baseDirectory + sUrl, LockerType.Leaves, "*");
                        continue;
                    }

                    FileOrFolder fs = new FileOrFolder(FileOrFolderType.File);
                    fs.sourceUrl = baseDirectory + sUrl;

                    // The local file name is the last path segment of the link.
                    string normalized = sUrl.Replace('/', '\\');
                    string fileName = normalized.Substring(normalized.LastIndexOf('\\') + 1);
                    string fileTarget = prefixFilePath + "\\" + fileName;
                    fs.fileName = fileName;

                    notify("Downloading " + fs.sourceUrl);
                    try
                    {
                        fs.title = RemoveSpecialCharacters(att.OwnerNode.InnerText).Trim();
                        string stored = downloadFileAvoidDuplicates(fs, fileTarget);
                        if (string.IsNullOrEmpty(stored))
                        {
                            continue;
                        }

                        fs.fileName = stored;
                        stack.Peek().Children.Add(fs);
                    }
                    catch (Exception exe)
                    {
                        // A failed download must not stop the remaining links.
                        notify(exe, url.ToString());
                    }
                }
            }
            catch (Exception exe)
            {
                try
                {
                    notify(exe, url.ToString());
                }
                catch { }
            }
            finally
            {
                // Unwind in finally so the stack is balanced even after an error.
                for (; pushed > 0; pushed--)
                {
                    stack.Pop();
                }
            }
        }

        /// <summary>
        /// Follows the result links of a search page: every anchor whose href starts with
        /// "/url?q=" is registered as a folder and its target is retrieved.
        /// </summary>
        /// <remarks>
        /// When <c>bGoogle</c> is set, targets containing "webcache" or "q=related" are skipped.
        /// Errors are reported through <c>notify</c> and never propagate to the caller.
        /// </remarks>
        /// <param name="htmlDoc">The HTML document.</param>
        /// <param name="url">The URL of the page being processed.</param>
        private void processWeb(HtmlAgilityPack.HtmlDocument htmlDoc, Uri url)
        {
            const string sResult = "/url?q=";
            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection links = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
                if (links == null)
                {
                    return;
                }

                foreach (HtmlNode link in links)
                {
                    HtmlAttribute att = link.Attributes["href"];
                    string sUrl = att.Value;

                    if (!sUrl.StartsWith(sResult, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    sUrl = sUrl.Substring(sResult.Length);

                    // Cut the trailing query parameters. A link without '&' used to make
                    // Substring throw, which aborted the whole page.
                    int ampersand = sUrl.IndexOf('&');
                    if (ampersand >= 0)
                    {
                        sUrl = sUrl.Substring(0, ampersand);
                    }

                    if (sUrl.Length == 0)
                    {
                        continue;
                    }

                    if (bGoogle
                        && (sUrl.IndexOf("webcache", StringComparison.OrdinalIgnoreCase) >= 0
                            || sUrl.IndexOf(@"q=related", StringComparison.OrdinalIgnoreCase) >= 0))
                    {
                        continue;
                    }

                    FileOrFolder fof = new FileOrFolder(FileOrFolderType.Folder);
                    fof.title = RemoveSpecialCharacters(att.OwnerNode.InnerText).Trim();
                    if (String.IsNullOrEmpty(fof.title))
                    {
                        continue;
                    }

                    stack.Peek().Children.Add(fof);
                    stack.Push(fof);
                    try
                    {
                        fof.sourceUrl = url.ToString();
                        notify("Navigating " + sUrl);
                        retrieveHTML(sUrl, LockerType.Words, null);
                    }
                    finally
                    {
                        // Keep the stack balanced even when the retrieval fails.
                        stack.Pop();
                    }
                }
            }
            catch (Exception exe)
            {
                try
                {
                    notify(exe, url.ToString());
                }
                catch { }
            }
        }

        /// <summary>
        /// Collects document entries from the nodes selected by an XPath expression.
        /// </summary>
        /// <remarks>
        /// For every child of every selected node, the child's text, a tag type derived from
        /// the icon file name found in its markup, and the extracted link are appended to
        /// <c>documents</c>. If the extraction throws, the catch block appends the child's
        /// text on its own. After the children, the text of the second child of the selected
        /// node's parent is appended. Failures outside the per-child handling are reported
        /// through <c>notify</c> and swallowed.
        /// </remarks>
        /// <param name="htmlDoc">The HTML document.</param>
        /// <param name="url">The URL; used here only when reporting an error.</param>
        /// <param name="lt">The locker type; not used by this method.</param>
        /// <param name="looker">The XPath expression passed to <c>SelectNodes</c>.</param>
        private void processHTMLClass(HtmlAgilityPack.HtmlDocument htmlDoc, Uri url, LockerType lt, string looker)
        {
            try
            {
                var nomedocumento = from foo in htmlDoc.DocumentNode.SelectNodes(looker) select foo;
                foreach (var nodes in nomedocumento)
                {
                    foreach (var childNode in nodes.ChildNodes)
                    {
                        string sValue = childNode.InnerText;
                        try
                        {
                            string link = childNode.InnerHtml;
                            // Skip the first 9 characters - presumably the `<a href="` prefix
                            // (TODO confirm against the source markup). Shorter markup throws
                            // and is handled by the catch below.
                            link = link.Substring(9);
                            // Classify the entry by the icon image referenced in the markup.
                            string tagType = string.Empty;
                            if (link.Contains("pdf.png") || link.Contains("pdf.gif"))
                                tagType = "pdf";
                            else if (link.Contains("link.png") || 
                            link.Contains("link.gif") || link.Contains("folder.gif"))
                                tagType = "link";
                            else if (link.Contains("txt.png") || link.Contains("txt.gif"))
                                tagType = "txt";
                            // Keep the text up to, but excluding, the character just before the
                            // first '>' (presumably the closing quote of the attribute).
                            int pos = link.IndexOf(">");
                            link = link.Substring(0, pos - 1);
                            if (link.EndsWith("\" target=\"_blank"))
                            {
                                // Strip the trailing target attribute. The Uri instance is never
                                // used; constructing it throws for a malformed link, which lands
                                // in the catch below.
                                pos = link.LastIndexOf("\" target=\"_blank");
                                link = link.Substring(0, pos); { Uri a = new Uri(link, UriKind.Absolute); }
                            }
                            // Same throw-away construction, here with UriKind.Relative.
                            else { Uri a = new Uri(link, UriKind.Relative); }
                            notify("Parsing: " + sValue + " " + tagType + " " + link);
                            // Entries are appended as a flat sequence: text, tag type, link.
                            documents.Add(sValue);
                            documents.Add(tagType);
                            documents.Add(link);
                        }
                        catch
                        {
                            // Extraction failed: record the text only.
                            documents.Add(sValue);
                        }
                    }
                    // Indexes the second child of the parent node; an out-of-range index is
                    // caught by the outer handler.
                    notify("Adding: " + nodes.ParentNode.ChildNodes[1].InnerText);
                    documents.Add(nodes.ParentNode.ChildNodes[1].InnerText);
                }
            }
            catch (Exception exe)
            {
                try
                {
                    notify(exe, url.ToString());
                }
                // A failure while reporting is deliberately ignored.
                catch { }
            }
        }

        /// <summary>
        /// Processes a leaf page: registers a folder for the page and downloads every file
        /// the page links to into that folder.
        /// </summary>
        /// <remarks>
        /// Links ending in "leaft.html" are skipped. Download targets are resolved against
        /// the part of the page URL that precedes "leaft.html", or against the page's
        /// directory when the URL does not contain it. Errors are reported through
        /// <c>notify</c> and never propagate to the caller.
        /// </remarks>
        /// <param name="htmlDoc">The HTML document.</param>
        /// <param name="url">The URL of the leaf page.</param>
        private void processLeaves(HtmlAgilityPack.HtmlDocument htmlDoc, Uri url)
        {
            const string leafPage = "leaft.html";

            string pageUrl = url.ToString();
            string[] prs = pageUrl.Split('/');

            FileOrFolder folder = new FileOrFolder(FileOrFolderType.Folder);
            folder.sourceUrl = pageUrl;
            // The folder is named after the second-to-last path segment of the page URL.
            folder.fileName = prs.Length >= 2 ? prs[prs.Length - 2] : string.Empty;
            stack.Peek().Children.Add(folder);
            stack.Push(folder);

            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection links = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
                if (links == null)
                {
                    return;
                }

                // Computed once per page. A page URL without "leaft.html" used to make
                // Substring throw on the first link and abort the page.
                int leafPos = pageUrl.LastIndexOf(leafPage, StringComparison.OrdinalIgnoreCase);
                string baseUrl = leafPos >= 0
                    ? pageUrl.Substring(0, leafPos)
                    : pageUrl.Substring(0, pageUrl.LastIndexOf('/') + 1);

                foreach (HtmlNode link in links)
                {
                    HtmlAttribute att = link.Attributes["href"];

                    if (att.Value.EndsWith(leafPage, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    FileOrFolder fs = new FileOrFolder(FileOrFolderType.File);
                    fs.bLeaf = true;
                    fs.sourceUrl = baseUrl + att.Value;
                    notify("Downloading " + fs.sourceUrl);
                    try
                    {
                        // NOTE(review): the href comes from the crawled page and is used
                        // unchecked in the local path; a value such as "..\x" would escape
                        // prefixFilePath. Confirm the sources are trusted or sanitise it.
                        string stored = downloadFile(fs.sourceUrl, prefixFilePath + "\\" + att.Value);
                        if (string.IsNullOrEmpty(stored))
                        {
                            continue;
                        }

                        fs.fileName = stored;
                        folder.Children.Add(fs);
                    }
                    catch (Exception exe)
                    {
                        // A failed download must not stop the remaining links.
                        notify(exe, fs.sourceUrl);
                    }
                }
            }
            catch (Exception exe)
            {
                try
                {
                    notify(exe, url.ToString());
                }
                catch { }
            }
            finally
            {
                stack.Pop();
            }
        }

        /// <summary>
        /// Enumerates the header and data cells of the table(s) whose id equals
        /// <paramref name="looker"/>. Cells referencing the PDF icon are detected, but no
        /// action is taken for them yet.
        /// </summary>
        /// <param name="htmlDoc">The HTML document.</param>
        /// <param name="url">The URL; used only when reporting an error.</param>
        /// <param name="lt">The locker type; not used by this method.</param>
        /// <param name="looker">The id of the table to look for.</param>
        private void processHTMLTable(HtmlAgilityPack.HtmlDocument htmlDoc, Uri url, LockerType lt, string looker)
        {
            try
            {
                string tableXPath = "//table[@id='" + looker + "']";

                // table -> rows -> cells, flattened lazily.
                var cells = htmlDoc.DocumentNode.SelectNodes(tableXPath).Cast<HtmlNode>()
                    .SelectMany(tableNode => tableNode.SelectNodes("tr").Cast<HtmlNode>())
                    .SelectMany(rowNode => rowNode.SelectNodes("th|td").Cast<HtmlNode>());

                foreach (var cell in cells)
                {
                    string cellHtml = cell.OuterHtml;
                    if (cellHtml.Contains("../immagini/pdf.gif"))
                    {
                    }
                }
            }
            catch (Exception failure)
            {
                try
                {
                    notify(failure, url.ToString());
                }
                catch { }
            }
        }

        /// <summary>
        /// Extracts the lower-cased, trimmed <c>href</c> of an element, filtering out links
        /// that are not worth following.
        /// </summary>
        /// <param name="element">The element.</param>
        /// <returns>
        /// The link, or an empty string when the element is null, has no <c>href</c>, or the
        /// link is a style sheet (".css"), points to "googleapis.com", or is just "#".
        /// </returns>
        string extractAttribute(HtmlAgilityPack.HtmlNode element)
        {
            if (element == null)
            {
                return string.Empty;
            }

            try
            {
                // Default to "" rather than null: a missing href used to trigger a
                // NullReferenceException that was logged as an error and made the method
                // return null.
                string link = element.GetAttributeValue("href", string.Empty)
                    .ToLowerInvariant()
                    .Trim();

                bool ignored = link.EndsWith(".css", StringComparison.Ordinal)
                    || link.Contains("googleapis.com")
                    || link == @"#";

                return ignored ? string.Empty : link;
            }
            catch (Exception exe)
            {
                try
                {
                    notify(exe, element.OuterHtml);
                }
                catch { }

                return string.Empty;
            }
        }

        /// <summary>
        /// Turns user input into an absolute HTTP URL.
        /// </summary>
        /// <remarks>
        /// Input without any dot is rejected. Input that already starts with "http://" or
        /// "https://" is returned as is. Otherwise "http://" is prepended, and "www." as well
        /// when the input contains a single dot (e.g. "example.com").
        /// Surrounding white space is removed.
        /// </remarks>
        /// <param name="iUrl">The input URL or host name.</param>
        /// <returns>The prepared URL, or <c>null</c> when the input is null, blank or contains no dot.</returns>
        private string prepareUrl(string iUrl)
        {
            if (string.IsNullOrWhiteSpace(iUrl))
            {
                return null;
            }

            string candidate = iUrl.Trim();

            int dots = candidate.Split('.').Length - 1;
            if (dots == 0)
            {
                return null;
            }

            // Check the scheme first, and only at the start of the string:
            // "http://example.com" (one dot) used to become "http://www.http://example.com".
            bool hasScheme =
                candidate.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                candidate.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (hasScheme)
            {
                return candidate;
            }

            // Hosts with three or more dots used to be returned without any scheme.
            return dots == 1
                ? @"http://www." + candidate
                : @"http://" + candidate;
        }

        /// <summary>
        /// Rewrites a SharePoint URL: every "Shared%20Documents/" becomes "/" and every
        /// ".aspx" becomes "/Forms/AllItems.aspx".
        /// </summary>
        /// <param name="url">The URL to rewrite.</param>
        /// <returns>The rewritten URL.</returns>
        private string TrasformSharePointUrl(string url)
        {
            const string sharedDocuments = "Shared%20Documents/";
            const string pageExtension = ".aspx";
            const string allItemsView = "/Forms/AllItems.aspx";

            // The two replacements run in the same order as before.
            string withoutLibrary = url.Replace(sharedDocuments, "/");
            return withoutLibrary.Replace(pageExtension, allItemsView);
        }

        /// <summary>
        /// Tests whether a URL can be downloaded completely through a
        /// <see cref="PersonalWebClient"/>.
        /// </summary>
        /// <param name="proxyUser">The proxy user.</param>
        /// <param name="proxyPassword">The proxy password.</param>
        /// <param name="url">The URL to test.</param>
        /// <returns>
        /// <c>true</c> when the resource was opened and read to the end without an error;
        /// <c>false</c> otherwise (including a null, empty or malformed URL).
        /// </returns>
        public static bool TestUrl(string proxyUser, string proxyPassword, string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return false;
            }

            try
            {
                // The using blocks close the reader, the stream and the client.
                using (PersonalWebClient client = new PersonalWebClient(proxyUser, proxyPassword))
                using (Stream data = client.OpenRead(new Uri(url)))
                using (StreamReader reader = new StreamReader(data))
                {
                    // The content itself is not needed; reading to the end only proves
                    // that the whole resource can be retrieved.
                    reader.ReadToEnd();
                }

                return true;
            }
            catch (Exception)
            {
                // Any failure (bad URI, network, proxy, HTTP error) means "not reachable".
                return false;
            }
        }
    }
}

Complete Code and Consultant

On request, I'll send you the complete code and support you for a couple of hours; in return, please send me something from my Amazon wish list: http://www.amazon.it/registry/wishlist/3DUGGYP0KMLF8

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

Elia Conchione
Software Developer (Senior) 42yo.com
Italy Italy
Elia Conchione is a Banking software engineer from 1986, living in Cittadella, Padova, Italy. Current programming interests include C, C++, C#, Linux, Unix, Mac OS X, Safe Code, encryption applications.
http://www.42yo.com
Follow on   Twitter

Comments and Discussions

 
-- There are no messages in this forum --
| Advertise | Privacy | Mobile
Web03 | 2.8.140814.1 | Last Updated 28 Jun 2014
Article Copyright 2014 by Elia Conchione
Everything else Copyright © CodeProject, 1999-2014
Terms of Service
Layout: fixed | fluid