$ wget http://archive.org/download/testWARCfiles/WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz
c:\> wget.exe http://archive.org/download/testWARCfiles/WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz
In this first gem, we’ll use some of the WSDK’s primitives to count the number of records inside the WARC file we’ve just downloaded.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | %% copy the following into a file called: "count_records.erl"
-module(count_records).
-export([from_warc/1]).
%% Count the records of the WARC file named Filename.
%% Returns the number of records as an integer.
from_warc(Filename) ->
    {ok, _FirstRecord, Warc} = wsdk_warc:read(Filename),
    %% the opening read already delivered one record, hence the initial 1
    count(Warc, 1).
%% Recursive counting loop.
%%   Handle - WARC handle to read the next record from
%%   Cnt    - number of records seen so far
%% Returns the final count once the end of the WARC is reached.
count(Handle, Cnt) ->
    case wsdk_warc:read(Handle) of        %% read next WARC record if any!
        {ok, _, Handle1} ->
            count(Handle1, Cnt + 1);      %% found a new record, increment Cnt
        eof ->
            %% we're done: close the WARC file handle and make sure the
            %% close really succeeded instead of ignoring its result
            ok = wsdk_warc:close(Handle),
            Cnt
    end.
|
c:\> erlc count_records.erl
c:\> werl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> count_records:from_warc("WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz").
64188
2> q().
We found 64188 WARC records.
Lines 1,2: declare a new module called count_records which exports one public function called from_warc/1.
The function from_warc takes only one argument, a WARC filename.
Line 9: is the body of the (recursive) counting loop.
Line 13: indicates the end of the WARC. We’re done and there are no more records to count. Just return Cnt.
10 lines of code ... too much for just counting
Let’s see!
$ gzip -dc WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz > WIDE-20110225183219005.warc
$ erl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> count_records:from_warc("WIDE-20110225183219005.warc").
64188
2> q().
The result is the same: 64188 records found.
Exercise
Let’s see how to proceed:
1 2 3 4 5 6 7 8 9 | %% copy the following into a file called: "record_status.erl"
-module(record_status).
-export([valid_or_not/1]).
%% Check the validity of the record obtained by opening the WARC file
%% named Filename.
%% Returns true when wsdk_record:is_valid/1 answers ok, false otherwise.
valid_or_not(Filename) ->
    {ok, FirstRecord, Warc} = wsdk_warc:read(Filename),
    IsValid = (ok =:= wsdk_record:is_valid(FirstRecord)),
    ok = wsdk_warc:close(Warc),    %% release the handle before answering
    IsValid.
|
$ erlc record_status.erl
$ erl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> record_status:valid_or_not("WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz").
true
2> q().
So yes, the first record is a valid WARC record.
Line 9: return the result.
Exercise
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | %% copy the following into a file called: "small_big.erl"
-module(small_big).
-export([find/1]).
%% useful field selectors
-include_lib("wsdk/include/wsdk.hrl").
%% macro which returns the record's length
-define(LENGTH(Record), wsdk_record:get(Record, ?'Content-Length')).
%% Find the smallest and the biggest record length in the WARC file named
%% Filename. Returns [{smallest, Length}, {biggest, Length}].
find(Filename) ->
    {ok, First, Warc} = wsdk_warc:read(Filename),
    %% the first record is both the smallest and the biggest seen so far
    find(Warc, First, First).
%% Scanning loop.
%%   Handle - WARC handle to read the next record from
%%   Small  - record with the smallest length seen so far
%%   Big    - record with the biggest length seen so far
%% At the end of the WARC the handle is closed and the two lengths are
%% returned as [{smallest, Min}, {biggest, Max}].
find(Handle, Small, Big) ->
    Min = ?LENGTH(Small),
    Max = ?LENGTH(Big),
    case wsdk_warc:read(Handle) of
        eof ->
            ok = wsdk_warc:close(Handle),
            [{smallest, Min}, {biggest, Max}];
        {ok, Current, Next} ->
            case ?LENGTH(Current) of      %% current record size
                Len when Len > Max -> find(Next, Small, Current);
                Len when Len < Min -> find(Next, Current, Big);
                _ -> find(Next, Small, Big)
            end
    end.
|
c:\> erlc small_big.erl
c:\> werl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> small_big:find("WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz").
[{smallest,52},{biggest,222794980}]
2> q().
The body of this new gem is very similar to the one in “1. Count WARC records”.
Exercise 1
Exercise 2
Exercise 3
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | %% copy the following into a file called: "record_info.erl"
-module(record_info).
-export([print/2]).
%% useful field selectors
-include_lib("wsdk/include/wsdk.hrl").
%% macro for field selection
-define(FIELD(Record, Selector), wsdk_record:get(Record, Selector)).
%% Collect a few header fields of the record located at Offset in the
%% WARC file named Filename.
%% Returns a list of {Label, Value} pairs labelled type, date, vsn, id, uri.
print(Filename, Offset) ->
    %% open the WARC and move to an offset
    {ok, Record, Warc} = wsdk_warc:read(Filename, Offset),
    Selectors = [
        {type, ?'WARC-Type'},
        {date, ?'WARC-Date'},
        {vsn,  ?'WARC-Version'},
        {id,   ?'WARC-Record-ID'},
        {uri,  ?'WARC-Target-URI'}
    ],
    Info = [{Label, ?FIELD(Record, Selector)} || {Label, Selector} <- Selectors],
    ok = wsdk_warc:close(Warc),
    Info.
|
$ erlc record_info.erl
$ erl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> record_info:print("WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz", 5970260).
[{type,<<"response">>},
{date,{{2011,2,25},{18,33,19}}},
{vsn,<<"1.0">>},
{id,<<"<urn:uuid:7c8beabf-0cae-47bd-928a-0625fbe5a306>">>},
{uri,<<"http://sotis-it.ru/index.php?option=com_fireboard&Itemid=79&func=view&catid=19&id=1708">>}]
2> q().
Line 20: the result is returned.
Exercise
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | %% copy the following into a file called: "record_payload.erl"
-module(record_payload).
-export([dump/2]).
%% Print the payload of the record located at Offset in the WARC file
%% named Filename, then close the file. Returns ok.
dump(Filename, Offset) ->
    %% open the WARC and move to an offset
    {ok, Record, Warc} = wsdk_warc:read(Filename, Offset),
    dump(Record),
    ok = wsdk_warc:close(Warc).
%% Print the payload of Record chunk by chunk, using ~p formatting.
%% Returns ok once wsdk_record:payload/1 reports eof.
dump(Record) ->
    case wsdk_record:payload(Record) of
        eof ->                          %% no more chunks to read from the payload
            ok;
        {ok, Chunk, Rest} ->            %% got a chunk, print it out
            io:format("~p", [Chunk]),
            dump(Rest)
    end.
|
$ erlc record_payload.erl
$ erl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> record_payload:dump("WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz", 4781045).
<<"GET /athletics/events/volleyball-hollins-trimatch HTTP/1.0\r\nUser-Agent: Mozilla/5.0 (compatible; archive.org_bot +http://www.archive.org/details/archive.org_bot)\r\nConnection: close\r\nReferer: http://www.salem.edu/sitemap\r\nHost: www.salem.edu\r\n\r\n">>
2> q().
So, this is an HTTP GET request.
Line 15: the call returns the tag eof, we’re done.
Exercise
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | %% copy the following into a UTF-8 encoded file called: "new_warc.erl"
-module(new_warc).
-export([create/1]).
%% useful field selectors
-include_lib("wsdk/include/wsdk.hrl").
%% Create a new plain WARC file named Filename holding two records:
%% a warcinfo record followed by a response record. Returns ok.
create(Filename) ->
    %% Chinese text; as code points:
    %% wsdk_utf8:to_binary([35486,25991,25945,23416,12539,35821,25991,25945,23398])
    {ok, UTF8Filename} = wsdk_utf8:to_binary("語文教學・语文教学"),

    %% WARC Info record
    InfoFields = [
        {?'WARC-Type', 'warcinfo'},
        {?'WARC-Version', '1.0'},
        {?'WARC-Date', {{2012,8,17},{22,57,14}}},
        {?'WARC-Filename', UTF8Filename},
        {?'WARC-Record-ID', <<"urn:uuid:35f02b38-eb19-4f0d-86e4-bfe95815069c">>}
    ],
    InfoRecord = wsdk_record:new(InfoFields),

    %% WARC Response record
    ResponseFields = [
        {?'WARC-Version', '0.17'},                 %% notice the version number 0.17
        {?'Content-Type', <<"application/http; msgtype=response">>},
        {?'WARC-Date', {{2012,8,18},{1,25,33}}},
        {?'WARC-Record-ID', wsdk_uuid:urn()},      %% generate a URN
        {?'WARC-Type', 'response'},
        {?'Payload-Type', 'bytes'},
        {?'Payload-Source', <<"this is a nice payload">>}
    ],
    ResponseRecord = wsdk_record:new(ResponseFields),

    %% each write returns a new handle that must be used for the next step
    {ok, Empty}    = wsdk_warc:write(Filename),                 %% create a new plain WARC file
    {ok, WithInfo} = wsdk_warc:write(Empty, InfoRecord),        %% write the warcinfo record
    {ok, WithBoth} = wsdk_warc:write(WithInfo, ResponseRecord), %% write the response record
    ok = wsdk_warc:close(WithBoth).
|
c:\> erlc new_warc.erl
c:\> werl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> new_warc:create("foo.warc").
2> q().
Have a look at the newly generated WARC file foo.warc in the current directory:
WARC/1.0
Content-Length: 0
WARC-Type: warcinfo
WARC-Date: 2012-08-17T22:57:14Z
WARC-Record-ID: urn:uuid:35f02b38-eb19-4f0d-86e4-bfe95815069c
WARC-Filename: 語文教學・语文教学
WARC/0.17
Content-Length: 22
WARC-Type: response
WARC-Date: 2012-08-18T01:25:33Z
WARC-Record-ID: urn:uuid:f8c7505b-95f2-11e1-7163-3ae800000024
Content-Type: application/http; msgtype=response
this is a nice payload
Exercise 1
Exercise 2
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | %% copy the following into a file called: "record_mime.erl"
-module(record_mime).
-export([stats/1]).
%% useful field selectors
-include_lib("wsdk/include/wsdk.hrl").
%% Count the records of the WARC file named Filename per MIME category.
%% Returns the tuple {Text, Image, Audio, Video, Other}.
stats(Filename) ->
    {ok, Record, Handle} = wsdk_warc:read(Filename),
    Mime = wsdk_record:get(Record, ?'Content-Type'),
    %% {0,0,0,0,0} is the initial value of the categories; the first record
    %% is classified right away, the loop takes care of the others
    stats(Handle, match(Mime, {0,0,0,0,0})).

%% Classification loop.
%%   Handle     - WARC handle to read the next record from
%%   Categories - counters accumulated so far
%% The handle on which eof is observed is the one that gets closed, as in
%% the other gems, rather than the stale handle of the very first read.
stats(Handle, Categories) ->
    case wsdk_warc:read(Handle) of        %% read next WARC record if any!
        {ok, Record, Handle1} ->
            Mime = wsdk_record:get(Record, ?'Content-Type'),
            stats(Handle1, match(Mime, Categories));
        eof ->
            ok = wsdk_warc:close(Handle),
            Categories                    %% no more records, return categories!
    end.
%% Increment the counter of the category the MIME type belongs to.
%% Categories is the tuple {Text, Image, Audio, Video, Other}.
match(Mime, {_, _, _, _, _} = Categories) ->
    Slot = category_slot(Mime),
    setelement(Slot, Categories, element(Slot, Categories) + 1).

%% Position of the counter to increment inside the categories tuple.
%% refine (add more) match patterns below to be more precise
category_slot(<<"text/", _/binary>>)  -> 1;
category_slot(<<"image/", _/binary>>) -> 2;
category_slot(<<"audio/", _/binary>>) -> 3;
category_slot(<<"video/", _/binary>>) -> 4;
category_slot(_)                      -> 5.   %% catch all: Other
|
$ erlc record_mime.erl
$ erl -s wsdk start
Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:2:2] [async-threads:0] [kernel-poll:false]
Eshell V5.9.1 (abort with ^G)
1> record_mime:stats("WIDE-20110225183219005-04371-13730~crawl301.us.archive.org~9443.warc.gz").
{26,0,0,0,64162}
2> q().
Line 11: set the Categories to {0,0,0,0,0} as initial value.
Line 25: if the MIME-Type starts with <<"text/">>, increment Text category.
Line 27: if the MIME-Type starts with <<"image/">>, increment Image category.
...
Line 33: this one is a catch all, we increment Other category.
Exercise 1
Exercise 2