Archive

Posts Tagged ‘erlang scrapping TOR google’

Scraping google results in Erlang - the sequel (how to do it securely)

June 13th, 2009 Daniello No comments

In the last post I showed how to use the Google search service from Erlang. But what if we want to do it securely and anonymously? We can use TOR, and we can do so even without installing TOR on our own machine, by using Scroogle.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
%% Fetches a search-result page from Scroogle over HTTPS and extracts the
%% result links from it with regular expressions.
-module(scroogle_scrapper).
 
%% NOTE(review): export_all makes every function in the module callable from
%% outside; consider replacing it with an explicit -export list.
-compile(export_all).
 
%% URL that fetch_scroogle_results/1 POSTs the query to.
-define(SCROOGLE_URL, "https://ssl.scroogle.org/cgi-bin/nbbw.cgi").
%% Placeholder: replace with the path of the PEM file that is handed to the
%% ssl layer as `cacertfile' in fetch_scroogle_results/1.
-define(SCROOGLE_PEM, "[PATH_TO_SSL_SCROOGLE_ORG_PEM_CERTIFICATE]").
 
%% Starts the inets and ssl applications (and whatever they depend on).
%%
%% Safe to call more than once: an application that is already running is
%% not treated as an error.
%%
%% Returns `ok' when both applications are running, or `{error, Reason}'
%% if either of them could not be started.
start() ->
  case application:ensure_all_started(inets) of
    {ok, _InetsStarted} ->
      case application:ensure_all_started(ssl) of
        {ok, _SslStarted} -> ok;
        {error, _} = SslError -> SslError
      end;
    {error, _} = InetsError ->
      % Do not hide an inets start-up failure behind the ssl result
      InetsError
  end.
 
%% POSTs the search query Q to ?SCROOGLE_URL over HTTPS.
%%
%% Q is passed through url_encoder:encode/1 and sent as the `Gw' form field.
%%
%% Returns the response body as a binary when the server answers with
%% status 200, `{error, {http_status, Status, ReasonPhrase}}' for any other
%% HTTP status, and `{error, Reason}' when the request itself fails.
fetch_scroogle_results(Q) ->
  % We want the body back as a binary
  Options = [{body_format, binary}],
  % verify_peer: only accept a server certificate that validates against
  % the CA certificate(s) in ?SCROOGLE_PEM
  HTTPOptions = [{ssl, [{cacertfile, ?SCROOGLE_PEM}, {verify, verify_peer}]}],
  ReqBody = "Gw=" ++ url_encoder:encode(Q) ++ "&n=1",
  Request = {?SCROOGLE_URL, [], "application/x-www-form-urlencoded", ReqBody},
  case httpc:request(post, Request, HTTPOptions, Options) of
    % Match on the status code only; the HTTP version and reason phrase
    % are not guaranteed to be exactly "HTTP/1.1" / "OK"
    {ok, {{_Version, 200, _ReasonPhrase}, _Headers, Body}} ->
      Body;
    % Report every other status instead of crashing with case_clause
    {ok, {{_Version, Status, ReasonPhrase}, _Headers, _Body}} ->
      {error, {http_status, Status, ReasonPhrase}};
    {error, Error} ->
      {error, Error}
  end.
 
 
%% Runs the regular expression RE over B and applies Fun to every match.
%%
%% The expression is applied globally, case-insensitively and in unicode
%% mode, with `.' matching newlines and `^'/`$' matching at line boundaries.
%% Each match is handed to Fun as a list of binaries: the whole match first,
%% followed by the captured groups.
%%
%% Returns the list of Fun results in match order, or [] when nothing matches.
parse(B, RE, Fun) ->
  ReOpts = [global, caseless, unicode, dotall, multiline, {capture, all, binary}],
  case re:run(B, RE, ReOpts) of
    nomatch ->
      [];
    {match, Found} ->
      [Fun(Groups) || Groups <- Found]
  end.
 
%% Extracts the numbered result entries ("1. <a href=...>...</a>") from the
%% page B.
%%
%% Returns a list with one element per numbered entry found; each element is
%% the list of {Href, Name} tuples that parse_result/1 produces for that
%% entry's anchor. Returns [] when no entry matches.
parse_results(B) ->
  % The backslash must be doubled: "\." in an Erlang string is just ".",
  % which would make the regex accept any character after the number
  RE = "[0-9]+?\\.[[:space:]]+(<a href=.+?</a>)",
  % Hand over only the captured anchor. Passing the whole [Whole, Anchor]
  % match would make parse_result/1 scan the anchor text twice (it occurs in
  % both elements) and report every link twice.
  parse(B, RE, fun([_Whole, Anchor]) -> parse_result(Anchor) end).
 
 
%% Extracts the link target and link text from the anchor tag(s) in GResult.
%%
%% GResult is the text to search; it is passed to parse/3 as the subject.
%%
%% Returns a list of {Href, Name} tuples of binaries, one per anchor
%% matched, or [] when GResult contains no matching anchor.
parse_result(GResult) ->
  % The double quotes around the href value must be escaped inside the
  % Erlang string literal, otherwise the module does not compile
  RE = "<a href=\"(.*?)\".*?>(.*?)</a>",
  % Drop the whole-match element and keep the two captured groups
  Fun = fun([_Whole, Href, Name]) -> {Href, Name} end,
  parse(GResult, RE, Fun).

You can use it as described in the previous post.