Click here to Skip to main content
15,886,199 members
Articles / General Programming / Algorithms

XmlToXsd - A Better Schema Generator

Rate me:
Please Sign up or sign in to vote.
4.93/5 (13 votes)
7 Dec 2010 · CPOL · 2 min read · 42.7K · 1.7K · 44
Build a better schema for rapid data model prototyping.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml.Linq;

namespace Xml2Xsd
{
    public class XsdFromXml
    {
        /// <summary>
        /// Bookkeeping record kept for each distinct element or attribute path
        /// discovered while scanning the sample document.
        /// </summary>
        class NodeInfo
        {
            /// <summary>Schema fragment built for this node; null until a builder assigns it.</summary>
            public XElement Schema { get; set; }

            /// <summary>Guessed xs: type name; null when no values were sampled.</summary>
            public string TypeGuess { get; set; }

            /// <summary>True once the same path has been encountered more than once.</summary>
            public bool Repeats { get; set; }

            /// <summary>Raw text values sampled from the document, used to guess the type.</summary>
            public IList<string> ValuesForTypeGuess { get; set; }

            /// <summary>
            /// Creates an entry with an empty sample list so callers can add values immediately.
            /// </summary>
            public NodeInfo()
            {
                this.ValuesForTypeGuess = new List<string>();
            }
        }

        // Working state shared by all the static helpers below. Generate() clears the
        // three collections on entry, so they only ever describe the most recent call.

        // Every element/attribute path seen in the sample ("/root/child", "/root/@attr"),
        // mapped to the information gathered for it.
        static Dictionary<string, NodeInfo> xpaths = new Dictionary<string, NodeInfo>();
        // Element local names seen so far (used as a set; the bool value is always true).
        static Dictionary<string, bool> elements = new Dictionary<string, bool>();
        // Element local names that showed up again under a new, previously unseen path.
        // NOTE(review): written by RecurseAllXPaths but not read anywhere in this file.
        static List<string> recurseElements = new List<string>();

        // Target namespace of the schema being generated; assigned by Generate().
        static XNamespace target;
        // The W3C XML Schema namespace, used for every xs:* element that is built.
        static readonly XNamespace xs = XNamespace.Get("http://www.w3.org/2001/XMLSchema");

        /// <summary>
        /// Infers an XML Schema document from a sample XML document.
        /// </summary>
        /// <remarks>
        /// The method works through the static xpaths / elements / recurseElements
        /// collections, which are cleared on entry; overlapping calls would therefore
        /// share (and clobber) that state.
        /// </remarks>
        /// <param name="content">Sample document; its root element and all descendants are scanned.</param>
        /// <param name="targetNamespace">
        /// Written to the schema's targetNamespace attribute and, when non-empty, also
        /// declared as the schema's default namespace.
        /// </param>
        /// <returns>
        /// A new document whose root is xs:schema, containing the complex types that were
        /// built plus one global xs:element for the sample's root element.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="content"/> or <paramref name="targetNamespace"/> is null.
        /// </exception>
        /// <exception cref="InvalidOperationException"><paramref name="content"/> has no root element.</exception>
        public static XDocument Generate(XDocument content, string targetNamespace)
        {
            if (null == content)
                throw new ArgumentNullException("content");
            if (null == targetNamespace)
                throw new ArgumentNullException("targetNamespace");

            var root = content.Root;
            if (null == root)
                throw new InvalidOperationException("The document has no root element to derive a schema from.");

            xpaths.Clear();
            elements.Clear();
            recurseElements.Clear();

            RecurseAllXPaths(string.Empty, root);

            target = XNamespace.Get(targetNamespace);

            // Sorting puts a parent path ahead of its children (it is a prefix of them),
            // so a parent's schema exists before its children try to attach to it.
            var compTypes = xpaths.Select(k => k.Key)
                .OrderBy(o => o)
                .Select(k => ComplexTypeElementFromXPath(k)).Where(q => null != q).ToList();

            // Describe the global root element from the root's own entry instead of
            // peeling "Info" off the first complex type: a root that only holds a simple
            // value produces no complex type at all, and must reference its guessed type.
            var baseElementName = root.Name.LocalName;
            var rootInfo = xpaths["/" + baseElementName];
            var baseElementType = string.IsNullOrEmpty(rootInfo.TypeGuess)
                ? baseElementName + "Info"
                : rootInfo.TypeGuess;

            compTypes.Add(new XElement(xs + "element",
                new XAttribute("name", baseElementName),
                new XAttribute("type", baseElementType)));

            // The root is created directly in the XML Schema namespace. Serializing a
            // target-namespace <schema> and text-replacing "schema" with "xs:schema"
            // would also rewrite any element name or namespace URI containing "schema".
            return new XDocument(new XElement(xs + "schema",
                // Why 'qualified'?
                // All "qualified" elements and attributes are in the targetNamespace of the
                // schema and all "unqualified" elements and attributes are in no namespace.
                // All global elements and attributes are qualified.
                new XAttribute("elementFormDefault", "qualified"),

                // Specify the target namespace, you will want this for schema validation
                new XAttribute("targetNamespace", targetNamespace),

                // hint to XDocument that we want the xml schema namespace to be called 'xs'
                new XAttribute(XNamespace.Xmlns + "xs", xs.NamespaceName),

                // Make the target namespace the default one so that unprefixed type
                // references (type="FooInfo") resolve to the types declared in this schema.
                // Skipped for an empty namespace, where there is nothing to declare.
                string.IsNullOrEmpty(targetNamespace) ? null : new XAttribute("xmlns", target.NamespaceName),
                compTypes));
        }

        /// <summary>
        /// Walks <paramref name="elem"/> and all of its descendants, registering one entry
        /// in xpaths per distinct element path and attribute path, and collecting the text
        /// values later used for type guessing.
        /// </summary>
        /// <param name="xpath">Path of the parent element; the empty string for the root.</param>
        /// <param name="elem">Element to record and descend into.</param>
        static void RecurseAllXPaths(string xpath, XElement elem)
        {
            var lclName = elem.Name.LocalName;
            xpath = string.Format("{0}/{1}", xpath, lclName);

            NodeInfo elemInfo;
            var missingXpath = !xpaths.TryGetValue(xpath, out elemInfo);
            var hasLcl = elements.ContainsKey(lclName);

            // Check for recursion in the element name (same name different level)
            if (hasLcl && missingXpath)
                recurseElements.Add(lclName);
            else if (!hasLcl)
                elements.Add(lclName, true);

            // First sighting registers the path; any later sighting marks it as repeating.
            // Note this counts every occurrence of the path in the document, not just
            // siblings under one parent instance.
            if (missingXpath)
                xpaths.Add(xpath, elemInfo = new NodeInfo());
            else
                elemInfo.Repeats = true;

            // Only leaf elements contribute values; a container's Value would be the
            // concatenated text of its descendants.
            if (!elem.HasElements && !string.IsNullOrEmpty(elem.Value))
                elemInfo.ValuesForTypeGuess.Add(elem.Value);

            // Skip every namespace declaration (xmlns="..." and xmlns:prefix="..."), not
            // just the default one - they are not data attributes.
            foreach (var attr in elem.Attributes().Where(a => !a.IsNamespaceDeclaration))
            {
                // Use the local name, as is done for elements. The expanded name
                // ("{namespace-uri}local") would inject '/' characters into the path and
                // break the later split on '/'.
                var xpathAttr = string.Format("{0}/@{1}", xpath, attr.Name.LocalName);

                NodeInfo attrInfo;
                if (!xpaths.TryGetValue(xpathAttr, out attrInfo))
                    xpaths.Add(xpathAttr, attrInfo = new NodeInfo());

                if (!string.IsNullOrEmpty(attr.Value))
                    attrInfo.ValuesForTypeGuess.Add(attr.Value);
            }

            foreach (var child in elem.Elements())
                RecurseAllXPaths(xpath, child);
        }


        /// <summary>
        /// Builds the schema fragment for one recorded path and returns the complex type
        /// that should be emitted at the top level of the schema, if any.
        /// </summary>
        /// <param name="xp">A key of xpaths, e.g. "/root/child" or "/root/@attr".</param>
        /// <returns>The complex type produced for the path, or null when nothing is to be emitted.</returns>
        private static XElement ComplexTypeElementFromXPath(string xp)
        {
            var segments = xp.Split('/');
            var leaf = segments.Last();
            var parentInfo = ParentElementByXPath(segments);
            var info = xpaths[xp];

            // A type is only guessed when sample values were collected for this path.
            if (info.ValuesForTypeGuess.Any())
                info.TypeGuess = TypeGuessFromValues(info.ValuesForTypeGuess);

            // Attribute paths end in a segment that is prefixed with '@'.
            if (leaf.StartsWith("@"))
                return BuildAttributeSchema(leaf, info, parentInfo);

            return BuildElementSchema(leaf, info, parentInfo);
        }

        /// <summary>
        /// Adds an xs:element declaration for <paramref name="elemName"/> to the parent's
        /// xs:sequence and creates the complex type that represents the element itself.
        /// </summary>
        /// <param name="elemName">Local name of the element.</param>
        /// <param name="nodeInfo">Collected information for the element; its Schema is assigned here.</param>
        /// <param name="parentInfo">Information for the parent element, or null when there is none.</param>
        /// <returns>
        /// The element's complex type when no simple type was guessed for it; otherwise
        /// null, because the element then references the guessed xs: type directly.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="nodeInfo"/> is null.</exception>
        private static XElement BuildElementSchema(string elemName, NodeInfo nodeInfo, NodeInfo parentInfo)
        {
            // Fail before touching the parent's schema: nodeInfo is required further down
            // (its Schema is assigned), so a null could never have completed successfully.
            if (null == nodeInfo)
                throw new ArgumentNullException("nodeInfo");

            XElement parent = (null == parentInfo) ? null : parentInfo.Schema;
            XElement seqElem;
            if (null != parent)
            {
                seqElem = parent.Element(xs + "sequence");

                // Inserted first so the sequence precedes any attributes already added.
                if (null == seqElem)
                    parent.AddFirst(seqElem = new XElement(xs + "sequence"));
            }
            else
            {
                // No parent schema: the declaration goes into a throw-away sequence.
                seqElem = new XElement(xs + "sequence");
            }

            var elemNameInfo = elemName + "Info";

            // No guessed simple type means the element is described by its own complex type.
            var hasKids = string.IsNullOrEmpty(nodeInfo.TypeGuess);

            var elem0 = new XElement(xs + "element",
                        new XAttribute("name", elemName),
                        new XAttribute("type", hasKids ? elemNameInfo : nodeInfo.TypeGuess));

            if (nodeInfo.Repeats)
                elem0.Add(new XAttribute("maxOccurs", "unbounded"));

            seqElem.Add(elem0); // add the ref to the existing sequence

            // Assigned even for simple-typed elements so that child paths always find a
            // schema to attach to; it is only emitted when returned below.
            nodeInfo.Schema = new XElement(xs + "complexType",
                    new XAttribute("name", elemNameInfo));

            return hasKids ? nodeInfo.Schema : null;
        }

        /// <summary>
        /// Creates an xs:attribute declaration and attaches it to the parent element's schema.
        /// </summary>
        /// <param name="attrName">Attribute name as it appears in the path, with its leading '@'.</param>
        /// <param name="nodeInfo">Collected information for the attribute; its Schema is assigned here.</param>
        /// <param name="parentInfo">Information for the owning element, or null when it could not be found.</param>
        /// <returns>Always null: attributes live inside their parent's type and are never emitted on their own.</returns>
        private static XElement BuildAttributeSchema(string attrName, NodeInfo nodeInfo, NodeInfo parentInfo)
        {
            // The parent lookup can come back empty; tolerate that the same way
            // BuildElementSchema does instead of failing on a null reference.
            XElement parent = (null == parentInfo) ? null : parentInfo.Schema;

            var elem0 = new XElement(xs + "attribute",
                new XAttribute("name", attrName.TrimStart('@')),
                // Attributes without sampled values fall back to a plain string.
                new XAttribute("type", nodeInfo.TypeGuess ?? "xs:string"));

            if (null != parent)
                parent.Add(elem0);

            nodeInfo.Schema = elem0;

            return null;
        }

        /// <summary>
        /// Finds the recorded information for the parent of a path.
        /// </summary>
        /// <param name="parts">The path split on '/', including the node's own final segment.</param>
        /// <returns>The parent's entry in xpaths, or null when no such path was recorded.</returns>
        private static NodeInfo ParentElementByXPath(IEnumerable<string> parts)
        {
            // Drop the last segment and glue the rest back together to get the parent path.
            var segments = parts.ToList();
            var parentPath = string.Join("/", segments.Take(segments.Count - 1).ToArray());

            NodeInfo found;
            return xpaths.TryGetValue(parentPath, out found) ? found : null;
        }

        /// <summary>
        ///  given a list of string values (of an attribute or element value), make a guess at the xs:type
        /// </summary>
        /// <remarks>
        /// Values are tested against the XML Schema lexical forms using the invariant
        /// culture, so the guess does not depend on the machine's regional settings and
        /// the sampled values are valid for the type that is returned.
        /// </remarks>
        /// <param name="values">Sampled text values; only the first ten are inspected.</param>
        /// <returns>
        /// "xs:int", "xs:decimal", "xs:date", "xs:dateTime" or "xs:boolean" when every
        /// inspected value fits that type; otherwise (or when there are no values) "xs:string".
        /// </returns>
        private static string TypeGuessFromValues(IEnumerable<string> values)
        {
            // In case this is a real world sample, just take 10 values
            var sample = values.Take(10).ToList();

            // All() is vacuously true for an empty list, which would claim "xs:int"
            // without any evidence.
            if (sample.Count == 0)
                return "xs:string";

            var invariant = CultureInfo.InvariantCulture;
            const DateTimeStyles outerWhite = DateTimeStyles.AllowLeadingWhite | DateTimeStyles.AllowTrailingWhite;

            if (sample.All(v => { int parsed; return int.TryParse(v, NumberStyles.Integer, invariant, out parsed); }))
                return "xs:int";

            // Sign and decimal point only: thousands separators are not valid xs:decimal.
            const NumberStyles decimalStyle = NumberStyles.Integer | NumberStyles.AllowDecimalPoint;
            if (sample.All(v => { decimal parsed; return decimal.TryParse(v, decimalStyle, invariant, out parsed); }))
                return "xs:decimal";

            // xs:date is YYYY-MM-DD with an optional time zone ("Z" or "+hh:mm").
            var dateFormats = new[] { "yyyy-MM-dd", "yyyy-MM-ddK" };
            if (sample.All(v => { DateTime parsed; return DateTime.TryParseExact(v, dateFormats, invariant, outerWhite, out parsed); }))
                return "xs:date";

            // Values carrying a time part are date-times, not dates.
            var dateTimeFormats = new[] { "yyyy-MM-dd'T'HH:mm:ss.FFFFFFF", "yyyy-MM-dd'T'HH:mm:ss.FFFFFFFK" };
            if (sample.All(v => { DateTime parsed; return DateTime.TryParseExact(v, dateTimeFormats, invariant, outerWhite, out parsed); }))
                return "xs:dateTime";

            // xs:boolean literals are case-sensitive; "True"/"False" are not valid.
            var booleanLiterals = new[] { "true", "false", "1", "0" };
            if (sample.All(v => null != v && booleanLiterals.Contains(v.Trim())))
                return "xs:boolean";

            return "xs:string";
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Engineer Big Company
United States
My professional career began as a developer fixing bugs on Microsoft Word97 and I've been fixing bad habits ever since. Now I do R&D work writing v1 line of business applications mostly in C#/.Net.

I've been an avid pilot/instructor for 13+ years, I've built two airplanes and mostly fly gliders now for fun. I commute in an all-electric 1986 BMW 325 conversion.

I'd like to get back to my academic roots of programming 3D analysis applications to organize complex systems.

Comments and Discussions