Home > erlang, programming > Scraping google results in Erlang

Scraping google results in Erlang

Currently, for both legal and technical reasons, my full music album search app (to be published soon) uses external search indices rather than building its own.
Among those I plan to use is the Google search engine. The app needs it to find pages containing links to the mp3 streams that it passes on to the user.
So effectively I’m going to build a Google search result scraper. Here is what it could look like:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
%% Google search result scraper: fetches the HTML result page for a query
%% and extracts {Href, Title} pairs from it.
-module(google_scrapper).

%% Explicit export list instead of -compile(export_all). Every function that
%% used to be reachable from outside the module is still listed, so existing
%% callers keep working.
-export([start/0,
         fetch_google_results/1,
         parse/3,
         parse_google_results/1,
         parse_google_result/1]).

%% Base search URL; the URL-encoded query is appended to it.
-define(GOOGLE_URL, "http://www.google.co.uk/search?hl=en&btnG=Search&meta=&q=").
 
%% @doc Starts the inets application, which provides the HTTP client that
%% fetch_google_results/1 relies on. Returns whatever inets:start/0 returns.
start() ->
  inets:start().
 
%% @doc Fetches the Google result page for the query string Q.
%%
%% Returns the response body as a binary when the server answers with
%% status 200. Returns {error, {http_status, Status, ReasonPhrase}} for any
%% other HTTP status, and {error, Reason} when the request itself fails.
%% inets must be running (see start/0).
fetch_google_results(Q) ->
  % Let the HTTP client follow redirects for us
  HTTPOptions = [{autoredirect, true}],
  % We want the body back as a binary
  Options = [{body_format, binary}],
  Headers = [
    % Let's be Firefox ;)
    {"User-Agent", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.10) Gecko/2009042315 Firefox/3.0.10"},
    {"Accept" , "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
    % Ask for a UTF-8 encoded result
    {"Accept-Charset", "utf-8;q=0.7,*;q=0.7"}
  ],
  Request = {?GOOGLE_URL ++ url_encoder:encode(Q), Headers},
  % httpc is the inets HTTP client; the old `http' module no longer exists.
  case httpc:request(get, Request, HTTPOptions, Options) of
    % Match on the status code only: the HTTP version and reason phrase
    % may legitimately differ between servers.
    {ok, {{_Version, 200, _Reason}, _RespHeaders, Body}} ->
      Body;
    % Any other status used to crash with case_clause; report it instead.
    {ok, {{_Version, Status, Reason}, _RespHeaders, _Body}} ->
      {error, {http_status, Status, Reason}};
    {error, Error} ->
      {error, Error}
  end.
 
 
%% @doc Runs the regular expression RE over B and maps Fun over the matches.
%%
%% The search is global, case-insensitive, Unicode-aware, with dotall and
%% multiline enabled. Each match is handed to Fun as a list of binaries:
%% the whole match first, then one element per capture group. Returns the
%% list of Fun results in match order, or [] when nothing matches.
parse(B, RE, Fun) ->
  ReOpts = [global, caseless, unicode, dotall, multiline, {capture, all, binary}],
  case re:run(B, RE, ReOpts) of
    nomatch ->
      [];
    {match, Found} ->
      [Fun(Captures) || Captures <- Found]
  end.
 
%% @doc Splits a Google result page B into its result sections and parses
%% each one.
%%
%% A section is the text between the `<!--m-->' and `<!--n-->' markers.
%% Returns one list of {Href, Name} tuples per section, as produced by
%% parse_google_result/1, or [] when the page contains no sections.
parse_google_results(B) ->
  RE = "<\!--m-->(.*?)<\!--n-->",
  % parse/3 delivers each match as [WholeMatch, Inner]. Only Inner is passed
  % on: handing over the whole list made the same HTML be scanned twice, so
  % every result came back duplicated.
  parse(B, RE, fun([_Whole, Inner]) -> parse_google_result(Inner) end).
 
 
%% @doc Extracts the links from one result section GResult.
%%
%% Looks for `<li class=g ... <h3 ... <a href="..." ...>...</a>' and returns a
%% list of {Href, Name} tuples (both binaries), where Href is the link target
%% and Name is the raw inner HTML of the anchor. Returns [] when nothing
%% matches.
parse_google_result(GResult) ->
  % The double quotes around the href value must be escaped; unescaped they
  % end the string literal early and the module does not compile.
  RE = "<li class=g.*?<h3.*?<a href=\"(.*?)\".*?>(.*?)</a>",
  Fun = fun([_, Href, Name]) ->
    {Href, Name}
  end,
  parse(GResult, RE, Fun).

As you may have noticed, I’ve used the url_encoder:encode/1 function. Standard OTP doesn’t include one, but you can get it from gist.github.com/127917,
ibrowse or
yaws

You can use it by typing:

google_scrapper:start().
B = google_scrapper:fetch_google_results("google images copyright").
R = google_scrapper:parse_google_results(B).

The result should be similar to:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
[
...
[{<<"http://www.lawdit.co.uk/reading_room/room/view_article.asp?name=../articles/Google%20Sued%20"...>>,
<<"<em>Google</em> Sued for <em>Copyright</em> Infringement through Use of &#39;<em>Google<"...>>},
{<<"http://www.lawdit.co.uk/reading_room/room/view_article.asp?name=../articles/Google%20Sue"...>>,
<<"<em>Google</em> Sued for <em>Copyright</em> Infringement through Use of &#39;<em>Goo"...>>}],
[{<<"http://www.mahalo.com/Google_Images_Copyright_Infringement">>,
<<"<em>Google Images Copyright</em> Infringement - Mahalo">>},
{<<"http://www.mahalo.com/Google_Images_Copyright_Infringement">>,
<<"<em>Google Images Copyright</em> Infringement - Mahalo">>}],
[{<<"http://www.goossip.com/2008/10/google-images-loses-two-copyright-cases.html">>,
<<"<em>Google Images</em> loses two <em>copyright</em> cases in Germany - Goossip <"...>>},
{<<"http://www.goossip.com/2008/10/google-images-loses-two-copyright-cases.html">>,
<<"<em>Google Images</em> loses two <em>copyright</em> cases in Germany - Gooss"...>>}]
…]
Categories: erlang, programming Tags:
  1. Bob Roberts
    September 28th, 2009 at 10:42 | #1

    Good example. GOOG’s been returning javascript as body of http request to all updated browsers, so I think you’d have to use IE 6 user-agent string for this to return 10 search results:

    Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)

    (also don’t need to “-compile(export_all).”

  1. June 13th, 2009 at 05:33 | #1