13 January 2013
18:48
# Introduction #
[This article](http://www.codeproject.com/Tips/526908/A-Simple-Parser-That-Converts-HTML-from-OneNote-to) introduces OneNote2Markdown, a parser I made that converts the html file generated from OneNote (by sending the page to Word and saving it as html) to Markdown format, which can then be translated to cleaner html by [any online Markdown parser](http://daringfireball.net/projects/markdown/dingus) later.
* Written in F#, the tool works with OneNote 2010 and Word 2010. It handles normal paragraphs, headings, links, lists, inlined code and code blocks only.
* The tool reads from "input.html" and writes to "output.txt".
* The source code of the latest version can be viewed at [Bitbucket](https://bitbucket.org/colinfang/onenote2markdown/src). It requires [HtmlAgilityPack](http://htmlagilitypack.codeplex.com/) to compile.
* The example pack contains this article in docx, html & Markdown formats, which gives a basic demonstration of how the tool works.
# Background #
I tend to take notes in OneNote. The first time I tried to submit an article composed in OneNote, it was a real hassle to manually adapt the content to the template in [Code Project](http://www.codeproject.com). So I decided to make a parser which would automate most of the formatting work for me.
# Implementation Overview #
## Preparations ##
* An Active Pattern to pattern match if a text node has a certain ancestor such as `<b>` or `<i>`.
/// Active pattern that yields `true` when `node` has at least one ancestor
/// element named `tag`, and `false` otherwise.
let (|HasAncestor|) tag (node: HtmlNode) =
    // "Some ancestor exists" is the same as "the ancestor sequence is not empty".
    node.Ancestors(tag) |> Seq.exists (fun _ -> true)
* A function to dig up a certain CSS property that a text node inherits from `style` attribute.
/// Looks up `cssProperty` in the `style` text (as returned by `getStyle`) of the
/// `<span>` ancestors of `node`.
/// Returns `Some value` for the first ancestor whose style declares the
/// property, or `None` when no `<span>` ancestor declares it.
let getPartialStyle cssProperty (node: HtmlNode) =
    // "property1:value1;property2:value2"
    // The property name is escaped so it is matched literally, and anchored to
    // the start of a declaration so that e.g. "font-family" is not found inside
    // "mso-fareast-font-family", nor "color" inside "background-color".
    let pattern = sprintf @"(?:^|;)\s*%s:(.+?)(;|$)" (Regex.Escape cssProperty)
    let predicate ancestor =
        let myMatch = Regex.Match(getStyle ancestor, pattern)
        if myMatch.Success then
            Some myMatch.Groups.[1].Value
        else
            None
    // Gets the value for the closest cssProperty.
    node.Ancestors("span") |> Seq.tryPick predicate
* A function to get a certain CSS property of a node from `style` attribute.
/// Looks up `cssProperty` in the `style` text (as returned by `getStyle`) of
/// `node` itself.
/// Returns `Some value` when the property is declared there, otherwise `None`.
let getPartialStyleSelf cssProperty (node: HtmlNode) =
    // "property1:value1;property2:value2"
    // The property name is escaped so it is matched literally, and anchored to
    // the start of a declaration so that e.g. "margin-left" is not found inside
    // a longer property name that merely ends with it.
    let pattern = sprintf @"(?:^|;)\s*%s:(.+?)(;|$)" (Regex.Escape cssProperty)
    let myMatch = Regex.Match(getStyle node, pattern)
    if myMatch.Success then
        Some myMatch.Groups.[1].Value
    else
        None
## Headings ##
* Determines the heading type of a paragraph by checking its `font-size` & `color` CSS properties, as well as whether it has a `<b>` or `<i>` ancestor.
// Classifies the paragraph from three inputs: its font size, its color, and
// whether the node has <b> / <i> ancestors. Levels 3-6 share the same size and
// color and are told apart only by the bold/italic combination. Any other
// combination is treated as a normal paragraph.
match font, color, node with
| Some "16.0pt", Some "#17365D", (HasAncestor "b" true) -> H 1
| Some "13.0pt", Some "#366092", (HasAncestor "b" true) -> H 2
| Some "11.0pt", Some "#366092", (HasAncestor "b" true) & (HasAncestor "i" false) -> H 3
| Some "11.0pt", Some "#366092", (HasAncestor "b" true) & (HasAncestor "i" true) -> H 4
| Some "11.0pt", Some "#366092", (HasAncestor "b" false) & (HasAncestor "i" false) -> H 5
| Some "11.0pt", Some "#366092", (HasAncestor "b" false) & (HasAncestor "i" true) -> H 6
| _ -> Normal
* Uses `## heading ##` syntax so that the Markdown parser doesn't eat the last `#` contained in the heading.
/// Formats `text` as a closed Markdown heading of level `n`,
/// e.g. level 2 gives "## text ##".
let headIt n text =
    // Build the run of '#' once and use it on both sides of the text.
    let fence = String.replicate n "#"
    String.Format("{0} {1} {0}", fence, text)
## Code ##
* Any text whose font is `Consolas` is considered as code, otherwise not.
// Passes the text through varIt only when getPartialStyle reports the
// font-family as exactly "Consolas"; every other case returns the text as is.
match getPartialStyle "font-family" textNode with
| Some "Consolas" -> varIt text
| _ -> text
* Simplifies Markdown syntax by combining several inlined code pieces into one if they are separated by white-spaces (e.g. `` `a` `b` `` -> `` `a b` ``). Preserves the leading spaces so as to protect indentations and blank lines within code blocks (anything inside the backticks is non-trivial and will not be removed later, e.g. ``` `  ``a` ``` -> `` `  a` ``). However, a limitation remains: the text itself cannot contain a backtick (`` ` ``).
/// Merges adjacent inline-code spans that are separated only by white-space
/// into a single span, keeping the white-space (e.g. "`a` `b`" becomes "`a b`").
let simplifyVar (text: string) =
    // The look-behind requires some character before the closing backtick, so
    // a backtick at the very start of the text (or of a line) is never consumed.
    // The matched "closing backtick, white-space, opening backtick" run is
    // replaced by the white-space alone.
    Regex.Replace(text, @"(?<=.)`(\s*)`", "$1")
* Differentiates code blocks from inlined code.
/// Returns `Some inner` when `text` is exactly one backtick-delimited span with
/// no backtick inside it (i.e. the whole text is code); otherwise `None`.
let tryGetPureCode (text: string) =
    // Anchored at both ends: any text outside the span, or a second span,
    // makes the match fail.
    let myMatch = Regex.Match(text, @"^`([^`]*)`$")
    if myMatch.Success then
        Some (myMatch.Result "$1")
    else
        None
## Lists ##
* Distinguishes between ordered lists and unordered lists by the symbol. Lists without symbols are considered as normal paragraphs without indentation.
/// Prefixes `text` with a Markdown list marker chosen from the list symbol `x`:
/// "o" and "·" give an unordered item, any other symbol gives an ordered item.
let listIt x text =
    let marker =
        match x with
        | "o" | "·" -> "*"
        | _ -> "1."
    sprintf "%s %s" marker text
* Gets the indentation by `margin-left:54.0pt` CSS property.
/// Computes the indentation level of `node` from its own `margin-left` style.
/// Returns 0 when the node declares no `margin-left`.
/// Fails with "indent parse error!" when the part of the margin value before
/// the first '.' is not an integer.
let getIndent (node: HtmlNode) =
    // each level is 27
    let pointsPerLevel = 27
    let levelOf (margin: string) =
        // Only the text before the first '.' is parsed, e.g. "54" from "54.0pt".
        let wholePart = margin.Split('.').[0]
        match Int32.TryParse wholePart with
        | true, points -> points / pointsPerLevel
        | false, _ -> failwith "indent parse error!"
    defaultArg (getPartialStyleSelf "margin-left" node |> Option.map levelOf) 0
## Links ##
* Checks if a piece of text contains the link by looking for `<a>` in its ancestors.
// When the text node has an <a> ancestor, hands the text and that ancestor's
// href to linkIt, using the first element returned by Ancestors("a").
// NOTE(review): "none" is presumably the fallback value when href is missing —
// confirm against HtmlAgilityPack's GetAttributeValue.
// Text without an <a> ancestor is returned unchanged.
match textNode with
| (HasAncestor "a" true) ->
let ancestor_a = textNode.Ancestors("a") |> Seq.head
linkIt text (ancestor_a.GetAttributeValue("href", "none"))
| _ -> text
## Finalization ##
* Gets the correct indentations and paragraph spacing for the whole content.
/// Assumes in OneNote there are no spaces in front of a code block (indent by tabs).
/// Assumes in OneNote the internal indentations of a code block are either all tabs or all spaces, never mixed.
/// Walks the paragraphs pairwise (previous, current) and produces, for each
/// current paragraph, its final text with indentation and blank-line spacing.
/// Returns an array with the same length as the input. Index 0 is never
/// assigned by the loop, so it keeps the Array.zeroCreate default value.
let review paragraphs =
// indentOffset is used for nesting indentations.
// If a benchmark line with indentation a, actually indents x, we set indentOffset = a - x.
// So any line with indentation b, does actually indent b - indentOffset = b - a + x.
let mutable listIndentOffset = 0
let mutable codeIndentOffset = 0
// Materialize the input once so it can be indexed by position.
let oldCopy = paragraphs |> Seq.toArray
let newCopy = Array.zeroCreate oldCopy.Length
// Looks at the current paragraph and the previous paragraph.
// I don't care about the first paragraph as it will be the title.
// Uses "\r\n" so that Notepad reads correctly.
for i in 1 .. oldCopy.Length - 1 do
// The eight cases below cover every (previous, current) pairing of
// Heading / Basic / Code / Listing. The previous paragraph's indentation is
// bound as `a` in several patterns but is not used in any branch.
match oldCopy.[i - 1], oldCopy.[i] with
| (Code _ | Listing _) , (Heading text | Basic text) -> 
// Code block / list block ends, prepends and appends new lines, and resets both indentOffsets.
newCopy.[i] <- sprintf "\r\n%s\r\n" text
listIndentOffset <- 0
codeIndentOffset <- 0
| (Heading _ | Basic _), (Heading text | Basic text) -> 
// Appends a new line.
newCopy.[i] <- sprintf "%s\r\n" text
| Code (_, a) , Code (text, b) ->
// Don't add a new line in between code blocks.
newCopy.[i] <- indentIt (b - codeIndentOffset) text
| (Heading _ | Basic _), Code (text, b) ->
// Code block starts, cache codeIndentOffset
// Indents 1 level only as Heading or Basic indents none.
newCopy.[i] <- indentIt 1 text
codeIndentOffset <- b - 1
| Listing (_, a) , Code (text, b) ->
// Code block within a list requires 1 additional level on top of the list indentation.
// Code block starts, cache codeIndentOffset.
// Prepends a new line.
newCopy.[i] <- sprintf "\r\n%s" (indentIt (b - listIndentOffset + 1) text)
// This line is the benchmark: it has indentation b and is emitted at
// b - listIndentOffset + 1, so the offset is their difference.
codeIndentOffset <- listIndentOffset - 1
| Listing (_, a) , Listing (text, b) ->
// Don't add a new line in between list blocks.
newCopy.[i] <- indentIt (b - listIndentOffset) text
| Code (_, a) , Listing (text, b) ->
// Code block ends, reset codeIndentOffset.
// Prepends a new line.
codeIndentOffset <- 0
newCopy.[i] <- sprintf "\r\n%s" (indentIt (b - listIndentOffset) text)
| (Heading _ | Basic _), Listing (text, b) ->
// List block starts, cache listIndentOffset
// The first list item is emitted without indentation, so its own
// indentation b becomes the offset for the items that follow.
listIndentOffset <- b
newCopy.[i] <- text
newCopy