In this post, we describe how to implement functions that help fetch information that can be used to study how well (or bad) the keywords are used on a given website in comparison with other (successful) competitors.
Update (June 25): some functions were modified to better fit the needs of the example application.
Update (June 27): the html code of Google links changes between the first page and the next ones. Hence the “old” code would only find links in the first page. The code has been modified to take this into account.
Update (June 28): shame on me, the google, bing and yahoo code for backlinks were buggy. Hopefully now that works !
Helpers
namespace Seo open System.Text.RegularExpressions ///The Links will hold the basic information : description & url [<Struct>] type Link(title:string, url:string) = member x.Title = title member x.Url = url override x.ToString() = sprintf "Link{%s (%s)}" x.Title x.Url ///We override the System.Net.WebClient to be able to set the user agent upon creation type WebClient(?encoding, ?userAgent) as client = inherit System.Net.WebClient() do userAgent |> Option.iter (fun agent -> client.Headers.Add( "user-agent", agent)) encoding |> Option.iter (fun enc -> client.Encoding <- enc) [<AutoOpen>] module internal Helpers = //Find only one occurrence let (|ParseRegexOne|_|) regex str = let m = Regex.Match(str, regex, RegexOptions.Singleline ||| RegexOptions.IgnoreCase) if m.Success then Some (List.tail [ for x in m.Groups -> x.Value ]) else None //Find all occurrences let (|ParseRegexMany|_|) regex str = let ms = Regex.Matches(str, regex, RegexOptions.Singleline ||| RegexOptions.IgnoreCase) let res = [ for m in ms do if m.Success then yield List.tail [ for x in m.Groups -> x.Value ] ] if List.length res > 0 then Some res else None let MozillaUserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.0; fr; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3" let MozillaGoogleToolbarAgent = "Mozilla/4.0 (compatible; GoogleToolbar 2.0.114-big; Windows XP 5.1)" //Exclude keywords less than this length let MIN_WORD_LENGTH = 3 let isWordValid s = String.length s >= MIN_WORD_LENGTH
AsyncWorker
This code comes from Don Syme’s blog. It is used to perform and monitor asynchronous tasks in parallel.
namespace Seo open System open System.Threading open System.IO open Microsoft.FSharp.Control.WebExtensions type AsyncWorker<'T>(jobs: seq<Async<'T>>) = // Capture the synchronization context to allow us to // raise events back on the GUI thread let syncContext = let x = System.Threading.SynchronizationContext.Current if x = null then new System.Threading.SynchronizationContext() else x // A standard helper to raise an event on the GUI thread let raiseEventOnGuiThread (event:Event<_>) args = syncContext.Post((fun _ -> event.Trigger args),state=null) // Each of these lines declares an F# event that we can raise let allCompleted = new Event<'T[]>() let error = new Event<System.Exception>() let canceled = new Event<System.OperationCanceledException>() let jobCompleted = new Event<int * 'T>() let cancellationCapability = new CancellationTokenSource() /// Start an instance of the work member x.Start() = // Mark up the jobs with numbers let jobs = jobs |> Seq.mapi (fun i job -> (job,i+1)) let work = Async.Parallel [ for (job,jobNumber) in jobs do yield async { let! result = job raiseEventOnGuiThread jobCompleted (jobNumber, result) return result } ] Async.StartWithContinuations ( work, (fun res -> raiseEventOnGuiThread allCompleted res), (fun exn -> raiseEventOnGuiThread error exn), (fun exn -> raiseEventOnGuiThread canceled exn ), cancellationCapability.Token ) member x.CancelAsync() = cancellationCapability.Cancel() /// Raised when a particular job completes member x.JobCompleted = jobCompleted.Publish /// Raised when all jobs complete member x.AllCompleted = allCompleted.Publish /// Raised when the composition is cancelled successfully member x.Canceled = canceled.Publish /// Raised when the composition exhibits an error member x.Error = error.Publish
Density
The Density module is used to track how often a word appears within a given list of words.
namespace Seo open System.Collections.Generic type Density = { NumKeywords : int NumMatches : int } with member x.Density = float x.NumMatches / float x.NumKeywords override x.ToString() = sprintf "{NumKeywords:%d NumMatches:%d Density:%0.2f}" x.NumKeywords x.NumMatches x.Density [<CompilationRepresentation(CompilationRepresentationFlags.ModuleSuffix)>] module Density = type t = Density let create k m = { NumKeywords = k NumMatches = m } let empty = create 0 0 let all ws = let dict = Dictionary<_, _>() ws |> List.iter (fun w -> let ok, v = dict.TryGetValue(w) if ok then dict.[w] <- {v with NumMatches = v.NumMatches+1} else dict.[w] <- create ws.Length 1 ) dict let single target ws = let matches = ref 0 let n = ws |> List.filter ((=) target) |> List.length { NumKeywords = List.length ws NumMatches = n } let merge d1 d2 = { NumKeywords = d1.NumKeywords + d2.NumKeywords NumMatches = d1.NumKeywords + d2.NumKeywords }
Keywords
We can now extract the keywords from a given webpage, and compute the density of the sought keywords.
namespace Seo open System.Collections.Generic //======================================= // // Locations where the keywords are sought // //======================================= //Source of the keywords we are interested in type Location = | MetaDescription | MetaKeywords | Title | Body | H | Bold | Links | Tags with override x.ToString() = match x with | MetaDescription -> "meta description" | MetaKeywords -> "meta keywords" | Title -> "title" | Body -> "body" | H -> "h1-h2-h3" | Bold -> "b-em-bold style" | Links -> "links" | Tags -> "alt-title-description attributes" //======================================= // // Parsing keywords from html // //======================================= module Keywords = let ALL_LOCATIONS = [ MetaDescription MetaKeywords Title Body H Bold Links Tags ] let toComparable (s:string) = s.Trim().ToLower() let toWords s = if isWordValid s then s.Split([|' '; '\n'; '\t'; ','|]) |> Array.map toComparable |> List.ofArray |> List.filter isWordValid else [] let rec mergeResults xs acc = xs |> List.concat |> List.map toWords |> List.concat let parseMeta name res = let p1 = sprintf "<meta [^=]+=\"%s\"\s+content=\"([^\"]*)\"[^>]*>" name let p2 = sprintf "<meta [^=]+='%s'\s+content='([^\"]*)'[^>]*>" name match res with | ParseRegexOne p1 [x] -> toWords x | ParseRegexOne p2 [x] -> toWords x | _ -> [] let parseTag tag res = let pattern = sprintf "<%s[^>]*>((?:(?!</%s>).)*)" tag tag match res with | ParseRegexMany pattern xs -> mergeResults xs [] | _ -> [] let parseBoldStyle = function | ParseRegexMany "font-weight:bold[^>]+>((?:(?!</).)*)" xs -> mergeResults xs [] | _ -> [] let parseLinks = function | ParseRegexMany "<a href=\"[^\"]*\"[^>]*>((?:(?!</a>).)*)" xs -> mergeResults xs [] | ParseRegexMany "<a href='[^']*'[^>]*>((?:(?!</a>).)*)" xs -> mergeResults xs [] | _ -> [] let parseAttribute attr res = let pattern1 = sprintf "<(?:(?!%s).)*((?:%s=\"([^\"]*)\")?)[^>]*>" attr attr let pattern2 = sprintf "<(?:(?!%s).)*((?:%s='([^']*)')?)[^>]*>" attr attr match res with | ParseRegexMany pattern1 xs -> mergeResults xs [] | ParseRegexMany pattern2 xs -> mergeResults xs [] | _ -> [] let parse location html = match location with | MetaDescription -> parseMeta "description" html | MetaKeywords -> parseMeta "keywords" html | Title -> parseTag "title" html | Body -> parseTag "body" html | H -> [ parseTag "h1" html parseTag "h2" html parseTag "h3" html ] |> List.concat | Bold -> [ parseTag "b" html parseTag "em" html parseBoldStyle html ] |> List.concat | Links -> parseLinks html | Tags -> [ parseAttribute "alt" html parseAttribute "title" html parseAttribute "description" html ] |> List.concat let parseMany locations html = let dict = Dictionary<_, _>() locations |> List.iter (fun location -> dict.Add(location, parse location html) ) dict let density target location html = let ws = parse location html Density.single target ws let densityMany target locations html = let dict = Dictionary<_, _>() for KeyValue(location, ws) in parseMany locations html do let density = Density.single target ws dict.Add(location, density) dict
SearchEngine
This is the core of our tools.
namespace Seo open System open System.Collections.Generic open System.Net open System.Text open Microsoft.FSharp.Control.WebExtensions open Seo type SearchEngine = { Name : string ResultsPerPage : uint32 QueryUrl : string -> int -> string TryParseResultsCount : string -> option<uint32> ParseLinks : string -> int -> uint32 -> list<Link * uint32> } with member x.NumPages maxResults = let res = if maxResults % x.ResultsPerPage <> 0u then maxResults / x.ResultsPerPage + 1u else maxResults / x.ResultsPerPage int res [<CompilationRepresentation(CompilationRepresentationFlags.ModuleSuffix)>] module SearchEngine = type t = SearchEngine type result_t = uint32 * list<Link * uint32> let run (engine:t) q maxResults = async { let nFound = ref 0u let all = ref [] if maxResults > 0u then let maxPages = engine.NumPages maxResults use webClient = new Seo.WebClient(encoding=Encoding.UTF8, userAgent=MozillaUserAgent) for page in 1 .. maxPages do let adjustedMaxResults = maxResults - (uint32 (page-1) * engine.ResultsPerPage) let url = engine.QueryUrl q page let uri = Uri(url) let! html = webClient.AsyncDownloadString(uri) if page = 1 then engine.TryParseResultsCount html |> Option.iter (fun n -> nFound := n) all := engine.ParseLinks html page adjustedMaxResults :: !all return (!nFound, List.concat !all) } //===================================== // // Back links // Indexed links // Keywords search // //===================================== let backLinks engine url isPageOnly includingSite maxResults = let host = Uri(url).Host let url = if isPageOnly then url else host let exclusion = if not includingSite then sprintf "+-site:%s" host else "" let q = sprintf "\"%s\"%s" url exclusion run engine q maxResults let indexedLinks engine url maxResults = let host = Uri(url).Host let q = sprintf "*+site:%s" host run engine q maxResults let keywordMatches engine (w:string) maxResults = let q = w.Replace(" ", "+") run engine q maxResults
An example implementation to fetch SEO information from Google.
In addition to a concrete implementation of the aforementionned SearchEngine, we use this module to define helper functions to fetch Google’s PageRank information.
namespace Seo open System open System.Collections.Generic open System.IO open System.Net open Microsoft.FSharp.Control.WebExtensions open Seo module Google = //===================================== // // PageRank // //===================================== let receivedPageRank = new Event<string * int>() let triggerReceivedPageRank = receivedPageRank.Trigger let ReceivedPageRank = receivedPageRank.Publish let SEED = "Mining PageRank is AGAINST GOOGLE'S TERMS OF SERVICE. Yes, I'm talking to you, scammer." let SEED_LENGTH = SEED.Length let hashUrl url = let html = ref 0x01020345u for i in 0 .. String.length url - 1 do html := !html ^^^ (uint32 SEED.[i % SEED_LENGTH]) ^^^ (uint32 url.[i]) html := (!html >>> 23) ||| (!html <<< 9) html := !html &&& 0xffffffffu sprintf "8%x" !html let pageRankUrl url = sprintf "http://toolbarqueries.google.com/search?client=navclient-auto&features=Rank:&q=info:%s&ch=%s" url (hashUrl url) let asyncPageRank url = async { let uri = Uri(pageRankUrl url) use webClient = new Seo.WebClient(userAgent=MozillaGoogleToolbarAgent) let! reply = webClient.AsyncDownloadString(uri) let rank = match reply with | ParseRegexOne "Rank_\d{1}:\d+:(\d+)" [pr] -> int pr | _ -> 0 triggerReceivedPageRank (url, rank) return rank } //===================================== // // Search engine // //===================================== let ROOT = "http://www.google.fr" let RESULTS_PER_PAGE = 100 let SEARCH_ENGINE = let queryUrl q page = sprintf "%s/search?q=%s&num=%d&start=%d" ROOT q RESULTS_PER_PAGE (RESULTS_PER_PAGE * (page-1)) let tryParseResultsCount (html:string) = match html with | ParseRegexOne "resultStats[^\d]*((?:(?!\s).)*)" [results] -> let ok, v = UInt32.TryParse(results.Trim().Replace(" ","").Replace(" ","").Replace(",","")) if ok then Some v else None | _ -> None let isUrlDangerous (url:string) = url.IndexOf("interstitial") <> -1 let rec parseLinksEntry maxResults results xs rank = match xs with | [url; desc] :: tl when List.length results < maxResults -> if isWordValid desc && not (isUrlDangerous url) then let title = desc.Replace("<em>", "").Replace("</em>", "") let newResults = (Link(title, url), rank)::results parseLinksEntry maxResults newResults tl (rank+1u) else parseLinksEntry maxResults results tl (rank+1u) | _ -> results let parseLinks (html:string) currentPage maxResults = let pattern = if currentPage = 1 then "<h3 class=\"r[^\"]*\"><a href=\"([^\"]*)\"[^\)]*\)\">((?:(?!</a>).)*)</a></h3>" else "<h3 class=\"r[^\"]*\"><a href=\"([^\"]*)\"[^>]*>((?:(?!</a>).)*)</a></h3>" match html with | ParseRegexMany pattern xs -> let rank = let start = uint32 <| currentPage - 1 let rpp = uint32 RESULTS_PER_PAGE start * rpp + 1u parseLinksEntry maxResults [] xs rank | _ -> [] { Name = "Google" ResultsPerPage = RESULTS_PER_PAGE QueryUrl = queryUrl TryParseResultsCount = tryParseResultsCount ParseLinks = parseLinks }
Bing
An example implementation to fetch SEO information from Bing.
namespace Seo open System open System.Collections.Generic open System.IO open System.Net open Seo module Bing = //===================================== // // Search engine // //===================================== let ROOT = "http://www.bing.com" let RESULTS_PER_PAGE = 10 let SEARCH_ENGINE = let queryUrl q page = sprintf "%s/search?q=%s&first=%d" ROOT q (RESULTS_PER_PAGE * (page-1) + 1) let tryParseResultsCount (html:string) = let pattern = "<span class=\"sb_count\" id=\"count\">(?:(?![\d ]+</span>).)*([\d ]+)" match html with | ParseRegexOne pattern [results] -> let ok, v = let res = results.Trim().Replace(" ","").Replace(" ","").Replace(" ","").Replace(",","") UInt32.TryParse(res) if ok then Some v else None | _ -> None let rec parseLinksEntry maxResults results xs rank = match xs with | [url; desc] :: tl when List.length results < maxResults -> if isWordValid desc then let newResults = (Link(desc, url), rank)::results parseLinksEntry maxResults newResults tl (rank+1u) else parseLinksEntry maxResults results tl (rank+1u) | _ -> results let parseLinks (html:string) currentPage maxResults = match html with | ParseRegexMany "<h3><a href=\"([^\"]*)\"[^>]*>((?:(?!</a>).)*)</a></h3>" xs -> let rank = let start = uint32 <| currentPage - 1 let rpp = uint32 RESULTS_PER_PAGE start * rpp parseLinksEntry maxResults [] xs rank | _ -> [] { Name = "Bing" ResultsPerPage = RESULTS_PER_PAGE QueryUrl = queryUrl TryParseResultsCount = tryParseResultsCount ParseLinks = parseLinks }
Yahoo
An example implementation to fetch SEO information from Yahoo.
namespace Seo open System open System.Collections.Generic open System.IO open System.Net open Seo module Yahoo = //===================================== // // Search engine // //===================================== let ROOT = "http://search.yahoo.com" let RESULTS_PER_PAGE = 10 let SEARCH_ENGINE = let queryUrl q page = sprintf "%s/search?p=%s&b=%d" ROOT q (RESULTS_PER_PAGE * (page-1) + 1) let tryParseResultsCount (html:string) = let pattern = "strong id=\"resultCount\">([^<]*)</strong>" match html with | ParseRegexOne pattern [results] -> let ok, v = let res = results.Trim().Replace(",","") UInt32.TryParse(res) if ok then Some v else None | _ -> None let rec parseLinksEntry maxResults results xs rank = match xs with | [url; desc] :: tl when List.length results < maxResults -> if isWordValid desc then let title = desc.Replace("<b>", "").Replace("</b>", "") let newResults = (Link(title, url), rank)::results parseLinksEntry maxResults newResults tl (rank+1u) else parseLinksEntry maxResults results tl (rank+1u) | _ -> results let parseLinks (html:string) currentPage maxResults = match html with | ParseRegexMany "<a class=\"yschttl spt\" href=\"([^\"]*)\"[^>]*>((?:(?!</a>).)*)</a>" xs -> printfn "maxResults:%d parsed:%A" maxResults xs let rank = let start = uint32 <| currentPage - 1 let rpp = uint32 RESULTS_PER_PAGE start * rpp parseLinksEntry maxResults [] xs rank | _ -> [] { Name = "Yahoo" ResultsPerPage = RESULTS_PER_PAGE QueryUrl = queryUrl TryParseResultsCount = tryParseResultsCount ParseLinks = parseLinks }