In this post, we describe how to implement functions that help fetch information that can be used to study how well (or bad) the keywords are used on a given website in comparison with other (successful) competitors.

Update (June 25): some functions were modified to better fit the needs of the example application.

Update (June 27): the html code of Google links changes between the first page and the next ones. Hence the “old” code would only find links in the first page. The code has been modified to take this into account.

Update (June 28): shame on me, the google, bing and yahoo code for backlinks were buggy. Hopefully now that works !

Helpers

namespace Seo

open System.Text.RegularExpressions

///The Links will hold the basic information : description & url
[<Struct>]
type Link(title:string, url:string) =
  member x.Title = title
  member x.Url = url
  override x.ToString() = sprintf "Link{%s (%s)}" x.Title x.Url

///We override the System.Net.WebClient to be able to set the user agent upon creation
type WebClient(?encoding, ?userAgent) as client =
  inherit System.Net.WebClient()
  do
    userAgent |> Option.iter (fun agent -> client.Headers.Add( "user-agent", agent))
    encoding |> Option.iter (fun enc -> client.Encoding <- enc)

[<AutoOpen>]
module internal Helpers =

  //Find only one occurrence
  let (|ParseRegexOne|_|) regex str =
    let m = Regex.Match(str, regex, RegexOptions.Singleline ||| RegexOptions.IgnoreCase)
    if m.Success then
      Some (List.tail [ for x in m.Groups -> x.Value ])
    else
      None

  //Find all occurrences
  let (|ParseRegexMany|_|) regex str =
    let ms = Regex.Matches(str, regex, RegexOptions.Singleline ||| RegexOptions.IgnoreCase)
    let res =
      [ for m in ms do
          if m.Success then
            yield List.tail [ for x in m.Groups -> x.Value ]
      ]
    if List.length res > 0 then Some res else None

  let MozillaUserAgent =
    "Mozilla/5.0 (Windows; U; Windows NT 5.0; fr; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3"

  let MozillaGoogleToolbarAgent =
    "Mozilla/4.0 (compatible; GoogleToolbar 2.0.114-big; Windows XP 5.1)"

  //Exclude keywords less than this length
  let MIN_WORD_LENGTH = 3

  let isWordValid s = String.length s >= MIN_WORD_LENGTH

AsyncWorker

This code comes from Don Syme’s blog. It is used to perform and monitor asynchronous tasks in parallel.

namespace Seo

open System
open System.Threading
open System.IO
open Microsoft.FSharp.Control.WebExtensions

type AsyncWorker<'T>(jobs: seq<Async<'T>>) =

    // Capture the synchronization context to allow us to
    // raise events back on the GUI thread
    let syncContext =
      let x = System.Threading.SynchronizationContext.Current
      if x = null then new System.Threading.SynchronizationContext() else x

    // A standard helper to raise an event on the GUI thread
    let raiseEventOnGuiThread (event:Event<_>) args =
      syncContext.Post((fun _ -> event.Trigger args),state=null)

    // Each of these lines declares an F# event that we can raise
    let allCompleted  = new Event<'T[]>()
    let error         = new Event<System.Exception>()
    let canceled      = new Event<System.OperationCanceledException>()
    let jobCompleted  = new Event<int * 'T>()

    let cancellationCapability = new CancellationTokenSource() 

    /// Start an instance of the work
    member x.Start() =
        // Mark up the jobs with numbers
        let jobs = jobs |> Seq.mapi (fun i job -> (job,i+1))
        let work =
          Async.Parallel
            [ for (job,jobNumber) in jobs do
                yield async
                  { let! result = job
                    raiseEventOnGuiThread jobCompleted (jobNumber, result)
                    return result
                  }
            ]
        Async.StartWithContinuations
            ( work,
              (fun res -> raiseEventOnGuiThread allCompleted res),
              (fun exn -> raiseEventOnGuiThread error exn),
              (fun exn -> raiseEventOnGuiThread canceled exn ),
              cancellationCapability.Token
            )

    member x.CancelAsync() = cancellationCapability.Cancel()

    /// Raised when a particular job completes
    member x.JobCompleted  = jobCompleted.Publish

    /// Raised when all jobs complete
    member x.AllCompleted  = allCompleted.Publish

    /// Raised when the composition is cancelled successfully
    member x.Canceled   = canceled.Publish

    /// Raised when the composition exhibits an error
    member x.Error      = error.Publish

Density

The Density module is used to track how often a word appears within a given list of words.

namespace Seo

open System.Collections.Generic

type Density = {
  NumKeywords : int
  NumMatches : int
} with
    member x.Density = float x.NumMatches / float x.NumKeywords
    override x.ToString() =
      sprintf "{NumKeywords:%d NumMatches:%d Density:%0.2f}" x.NumKeywords x.NumMatches x.Density

[<CompilationRepresentation(CompilationRepresentationFlags.ModuleSuffix)>]
module Density =

  type t = Density

  let create k m =
    { NumKeywords = k
      NumMatches = m
    }

  let empty = create 0 0

  let all ws =
    let dict = Dictionary<_, _>()
    ws |> List.iter (fun w ->
      let ok, v = dict.TryGetValue(w)
      if ok then
        dict.[w] <- {v with NumMatches = v.NumMatches+1}
      else
        dict.[w] <- create ws.Length 1
    )
    dict

  let single target ws =
    let matches = ref 0
    let n = ws |> List.filter ((=) target) |> List.length
    { NumKeywords = List.length ws
      NumMatches = n
    }

  let merge d1 d2 =
    { NumKeywords = d1.NumKeywords + d2.NumKeywords
      NumMatches = d1.NumKeywords + d2.NumKeywords
    }

Keywords

We can now extract the keywords from a given webpage, and compute the density of the sought keywords.

namespace Seo

open System.Collections.Generic

//=======================================
//
//  Locations where the keywords are sought
//
//=======================================

//Source of the keywords we are interested in
type Location =
  | MetaDescription
  | MetaKeywords
  | Title
  | Body
  | H
  | Bold
  | Links
  | Tags
  with
    override x.ToString() =
      match x with
      | MetaDescription -> "meta description"
      | MetaKeywords -> "meta keywords"
      | Title -> "title"
      | Body -> "body"
      | H -> "h1-h2-h3"
      | Bold -> "b-em-bold style"
      | Links -> "links"
      | Tags -> "alt-title-description attributes"

//=======================================
//
//  Parsing keywords from html
//
//=======================================

module Keywords =

  let ALL_LOCATIONS = [
    MetaDescription
    MetaKeywords
    Title
    Body
    H
    Bold
    Links
    Tags
  ]

  let toComparable (s:string) = s.Trim().ToLower()

  let toWords s =
    if isWordValid s then
      s.Split([|' '; '\n'; '\t'; ','|])
        |> Array.map toComparable
        |> List.ofArray
        |> List.filter isWordValid
    else []

  let rec mergeResults xs acc =
    xs |> List.concat |> List.map toWords |> List.concat

  let parseMeta name res =
    let p1 = sprintf "<meta [^=]+=\"%s\"\s+content=\"([^\"]*)\"[^>]*>" name
    let p2 = sprintf "<meta [^=]+='%s'\s+content='([^\"]*)'[^>]*>" name
    match res with
    | ParseRegexOne p1 [x] -> toWords x
    | ParseRegexOne p2 [x] -> toWords x
    | _ -> []

  let parseTag tag res =
    let pattern = sprintf "<%s[^>]*>((?:(?!</%s>).)*)" tag tag
    match res with
    | ParseRegexMany pattern xs -> mergeResults xs []
    | _ -> []    

  let parseBoldStyle = function
    | ParseRegexMany "font-weight:bold[^>]+>((?:(?!</).)*)" xs -> mergeResults xs []
    | _ -> []

  let parseLinks = function
    | ParseRegexMany "<a href=\"[^\"]*\"[^>]*>((?:(?!</a>).)*)" xs -> mergeResults xs []
    | ParseRegexMany "<a href='[^']*'[^>]*>((?:(?!</a>).)*)" xs -> mergeResults xs []
    | _ -> []

  let parseAttribute attr res =
    let pattern1 = sprintf "<(?:(?!%s).)*((?:%s=\"([^\"]*)\")?)[^>]*>" attr attr
    let pattern2 = sprintf "<(?:(?!%s).)*((?:%s='([^']*)')?)[^>]*>" attr attr
    match res with
    | ParseRegexMany pattern1 xs -> mergeResults xs []
    | ParseRegexMany pattern2 xs -> mergeResults xs []
    | _ -> []

  let parse location html =
    match location with
    | MetaDescription -> parseMeta "description" html
    | MetaKeywords -> parseMeta "keywords" html
    | Title -> parseTag "title" html
    | Body -> parseTag "body" html
    | H ->
        [ parseTag "h1" html
          parseTag "h2" html
          parseTag "h3" html
        ] |> List.concat
    | Bold ->
        [ parseTag "b" html
          parseTag "em" html
          parseBoldStyle html
        ] |> List.concat
    | Links -> parseLinks html
    | Tags ->
        [ parseAttribute "alt" html
          parseAttribute "title" html
          parseAttribute "description" html
        ] |> List.concat    

  let parseMany locations html =
    let dict = Dictionary<_, _>()
    locations |> List.iter (fun location ->
      dict.Add(location, parse location html)
    )
    dict

  let density target location html =
    let ws = parse location html
    Density.single target ws

  let densityMany target locations html =
    let dict = Dictionary<_, _>()
    for KeyValue(location, ws) in parseMany locations html do
      let density = Density.single target ws
      dict.Add(location, density)
    dict

SearchEngine

This is the core of our tools.

namespace Seo

open System
open System.Collections.Generic
open System.Net
open System.Text

open Microsoft.FSharp.Control.WebExtensions

open Seo

type SearchEngine = {
  Name : string
  ResultsPerPage : uint32
  QueryUrl : string -> int -> string
  TryParseResultsCount : string -> option<uint32>
  ParseLinks : string -> int -> uint32 -> list<Link * uint32>
} with
    member x.NumPages maxResults =
      let res =
        if maxResults % x.ResultsPerPage <> 0u then
          maxResults / x.ResultsPerPage + 1u
        else
          maxResults / x.ResultsPerPage
      int res  

[<CompilationRepresentation(CompilationRepresentationFlags.ModuleSuffix)>]
module SearchEngine =
  type t = SearchEngine

  type result_t = uint32 * list<Link * uint32>

  let run (engine:t) q maxResults =
    async {
      let nFound = ref 0u
      let all = ref []

      if maxResults > 0u then
        let maxPages = engine.NumPages maxResults
        use webClient = new Seo.WebClient(encoding=Encoding.UTF8, userAgent=MozillaUserAgent)

        for page in 1 .. maxPages do
          let adjustedMaxResults = maxResults - (uint32 (page-1) * engine.ResultsPerPage)

          let url = engine.QueryUrl q page
          let uri = Uri(url)
          let! html = webClient.AsyncDownloadString(uri)

          if page = 1 then
            engine.TryParseResultsCount html |> Option.iter (fun n -> nFound := n)

          all := engine.ParseLinks html page adjustedMaxResults :: !all

      return (!nFound, List.concat !all)
    }

  //=====================================
  //
  //  Back links
  //  Indexed links
  //  Keywords search
  //
  //=====================================

  let backLinks engine url isPageOnly includingSite maxResults =
    let host = Uri(url).Host
    let url = if isPageOnly then url else host
    let exclusion = if not includingSite then sprintf "+-site:%s" host else ""
    let q = sprintf "\"%s\"%s" url exclusion
    run engine q maxResults

  let indexedLinks engine url maxResults =
    let host = Uri(url).Host
    let q = sprintf "*+site:%s" host
    run engine q maxResults

  let keywordMatches engine (w:string) maxResults =
    let q = w.Replace(" ", "+")
    run engine q maxResults

Google

An example implementation to fetch SEO information from Google.

In addition to a concrete implementation of the aforementionned SearchEngine, we use this module to define helper functions to fetch Google’s PageRank information.

namespace Seo

open System
open System.Collections.Generic
open System.IO
open System.Net

open Microsoft.FSharp.Control.WebExtensions

open Seo

module Google =

  //=====================================
  //
  //  PageRank
  //
  //=====================================  

  let receivedPageRank = new Event<string * int>()
  let triggerReceivedPageRank = receivedPageRank.Trigger
  let ReceivedPageRank = receivedPageRank.Publish

  let SEED = "Mining PageRank is AGAINST GOOGLE'S TERMS OF SERVICE. Yes, I'm talking to you, scammer."
  let SEED_LENGTH = SEED.Length

  let hashUrl url =
    let html = ref 0x01020345u
    for i in 0 .. String.length url - 1 do
      html := !html ^^^ (uint32 SEED.[i % SEED_LENGTH]) ^^^ (uint32 url.[i])
      html := (!html >>> 23) ||| (!html <<< 9)
      html := !html &&& 0xffffffffu
    sprintf "8%x" !html

  let pageRankUrl url =
    sprintf "http://toolbarqueries.google.com/search?client=navclient-auto&features=Rank:&q=info:%s&ch=%s"
      url (hashUrl url)

  let asyncPageRank url =
    async {
      let uri = Uri(pageRankUrl url)
      use webClient = new Seo.WebClient(userAgent=MozillaGoogleToolbarAgent)
      let! reply = webClient.AsyncDownloadString(uri)
      let rank =
        match reply with
        | ParseRegexOne "Rank_\d{1}:\d+:(\d+)" [pr] -> int pr
        | _ -> 0
      triggerReceivedPageRank (url, rank)
      return rank
    }

  //=====================================
  //
  //  Search engine
  //
  //=====================================

  let ROOT = "http://www.google.fr"

  let RESULTS_PER_PAGE = 100

  let SEARCH_ENGINE =
    let queryUrl q page =
      sprintf "%s/search?q=%s&num=%d&start=%d" ROOT q RESULTS_PER_PAGE (RESULTS_PER_PAGE * (page-1))

    let tryParseResultsCount (html:string) =
      match html with
      | ParseRegexOne "resultStats[^\d]*((?:(?!\s).)*)" [results] ->
          let ok, v = UInt32.TryParse(results.Trim().Replace("&nbsp;","").Replace(" ","").Replace(",",""))
          if ok then Some v else None
      | _ -> None

    let isUrlDangerous (url:string) =
      url.IndexOf("interstitial") <> -1

    let rec parseLinksEntry maxResults results xs rank =
      match xs with
      | [url; desc] :: tl when List.length results < maxResults ->
          if isWordValid desc && not (isUrlDangerous url) then
            let title = desc.Replace("<em>", "").Replace("</em>", "")
            let newResults = (Link(title, url), rank)::results
            parseLinksEntry maxResults newResults tl (rank+1u)
          else
            parseLinksEntry maxResults results tl (rank+1u)
      | _ -> results

    let parseLinks (html:string) currentPage maxResults =
      let pattern =
        if currentPage = 1 then
          "<h3 class=\"r[^\"]*\"><a href=\"([^\"]*)\"[^\)]*\)\">((?:(?!</a>).)*)</a></h3>"
        else
          "<h3 class=\"r[^\"]*\"><a href=\"([^\"]*)\"[^>]*>((?:(?!</a>).)*)</a></h3>"

      match html with
      | ParseRegexMany pattern xs ->
          let rank =
            let start = uint32 <| currentPage - 1
            let rpp = uint32 RESULTS_PER_PAGE
            start * rpp + 1u
          parseLinksEntry maxResults [] xs rank
      | _ -> []

    { Name = "Google"
      ResultsPerPage = RESULTS_PER_PAGE
      QueryUrl = queryUrl
      TryParseResultsCount = tryParseResultsCount
      ParseLinks = parseLinks
    }

Bing

An example implementation to fetch SEO information from Bing.

namespace Seo

open System
open System.Collections.Generic
open System.IO
open System.Net

open Seo

module Bing =

  //=====================================
  //
  //  Search engine
  //
  //=====================================

  let ROOT = "http://www.bing.com"

  let RESULTS_PER_PAGE = 10

  let SEARCH_ENGINE =
    let queryUrl q page =
      sprintf "%s/search?q=%s&first=%d" ROOT q (RESULTS_PER_PAGE * (page-1) + 1)

    let tryParseResultsCount (html:string) =
      let pattern =
        "<span class=\"sb_count\" id=\"count\">(?:(?![\d&#160;]+</span>).)*([\d&#160;]+)"
      match html with
      | ParseRegexOne pattern [results] ->
          let ok, v =
            let res =
              results.Trim().Replace("&#160;","").Replace("&nbsp;","").Replace(" ","").Replace(",","")
            UInt32.TryParse(res)
          if ok then Some v else None
      | _ -> None

    let rec parseLinksEntry maxResults results xs rank =
      match xs with
      | [url; desc] :: tl when List.length results < maxResults ->
          if isWordValid desc then
            let newResults = (Link(desc, url), rank)::results
            parseLinksEntry maxResults newResults tl (rank+1u)
          else
            parseLinksEntry maxResults results tl (rank+1u)
      | _ -> results

    let parseLinks (html:string) currentPage maxResults =
      match html with
      | ParseRegexMany "<h3><a href=\"([^\"]*)\"[^>]*>((?:(?!</a>).)*)</a></h3>" xs ->
          let rank =
            let start = uint32 <| currentPage - 1
            let rpp = uint32 RESULTS_PER_PAGE
            start * rpp
          parseLinksEntry maxResults [] xs rank
      | _ -> []

    { Name = "Bing"
      ResultsPerPage = RESULTS_PER_PAGE
      QueryUrl = queryUrl
      TryParseResultsCount = tryParseResultsCount
      ParseLinks = parseLinks
    }  

Yahoo

An example implementation to fetch SEO information from Yahoo.

namespace Seo

open System
open System.Collections.Generic
open System.IO
open System.Net

open Seo

module Yahoo =

  //=====================================
  //
  //  Search engine
  //
  //=====================================

  let ROOT = "http://search.yahoo.com"

  let RESULTS_PER_PAGE = 10

  let SEARCH_ENGINE =
    let queryUrl q page =
      sprintf "%s/search?p=%s&b=%d" ROOT q (RESULTS_PER_PAGE * (page-1) + 1)

    let tryParseResultsCount (html:string) =
      let pattern = "strong id=\"resultCount\">([^<]*)</strong>"
      match html with
      | ParseRegexOne pattern [results] ->
          let ok, v =
            let res = results.Trim().Replace(",","")
            UInt32.TryParse(res)
          if ok then Some v else None
      | _ -> None

    let rec parseLinksEntry maxResults results xs rank =
      match xs with
      | [url; desc] :: tl when List.length results < maxResults ->
          if isWordValid desc then
            let title = desc.Replace("<b>", "").Replace("</b>", "")
            let newResults = (Link(title, url), rank)::results
            parseLinksEntry maxResults newResults tl (rank+1u)
          else
            parseLinksEntry maxResults results tl (rank+1u)
      | _ -> results

    let parseLinks (html:string) currentPage maxResults =
      match html with
      | ParseRegexMany "<a class=\"yschttl spt\" href=\"([^\"]*)\"[^>]*>((?:(?!</a>).)*)</a>" xs ->
          printfn "maxResults:%d parsed:%A" maxResults xs
          let rank =
            let start = uint32 <| currentPage - 1
            let rpp = uint32 RESULTS_PER_PAGE
            start * rpp
          parseLinksEntry maxResults [] xs rank
      | _ -> []  

    { Name = "Yahoo"
      ResultsPerPage = RESULTS_PER_PAGE
      QueryUrl = queryUrl
      TryParseResultsCount = tryParseResultsCount
      ParseLinks = parseLinks
    }  

Comments are closed.