mirror of
https://github.com/btdig/dhtcrawler2.git
synced 2025-01-19 04:31:37 +00:00
add cache_indexer, not integrated now, see src/cache_indexer/readme.md
This commit is contained in:
parent
539dd9103b
commit
c78e5d2f9c
@ -34,3 +34,7 @@
|
||||
[debug_info,
|
||||
{i, "include"},
|
||||
{outdir,"ebin"}]}.
|
||||
{'src/cache_indexer/*',
|
||||
[debug_info,
|
||||
{i, "include"},
|
||||
{outdir,"ebin"}]}.
|
||||
|
2
src/cache_indexer/README.md
Normal file
2
src/cache_indexer/README.md
Normal file
@ -0,0 +1,2 @@
|
||||
cache_indexer download torrent index file from torrage.com and build the torrent index in database, so that dhtcrawler2 can check whether a torrent can retrieved from http cache quickly.
|
||||
|
29
src/cache_indexer/db_hash_index.erl
Normal file
29
src/cache_indexer/db_hash_index.erl
Normal file
@ -0,0 +1,29 @@
|
||||
%%
|
||||
%% db_hash_index.erl
|
||||
%% Kevin Lynx
|
||||
%% 07.14.2013
|
||||
%%
|
||||
-module(db_hash_index).
|
||||
-export([insert/2, exist/2]).
|
||||
-define(DBNAME, hash_cache).
|
||||
-define(COLLNAME, hashes).
|
||||
|
||||
insert(Conn, Hash) when is_list(Hash) ->
|
||||
case catch do_insert(Conn, Hash) of
|
||||
{'EXIT', _} -> failed;
|
||||
_ -> ok
|
||||
end.
|
||||
|
||||
do_insert(Conn, Hash) ->
|
||||
Doc = {'_id', list_to_binary(Hash)},
|
||||
mongo:do(safe, master, Conn, ?DBNAME, fun() ->
|
||||
mongo:insert(?COLLNAME, Doc)
|
||||
end).
|
||||
|
||||
exist(Conn, Hash) when is_list(Hash) ->
|
||||
Sel = {'_id', list_to_binary(Hash)},
|
||||
{Doc} = mongo:do(safe, master, Conn, ?DBNAME, fun() ->
|
||||
mongo:find_one(?COLLNAME, Sel)
|
||||
end),
|
||||
Doc == {}.
|
||||
|
160
src/cache_indexer/index_builder.erl
Normal file
160
src/cache_indexer/index_builder.erl
Normal file
@ -0,0 +1,160 @@
|
||||
%%
|
||||
%% index_builder.erl
|
||||
%% Kevin Lynx
|
||||
%% 07.14.2013
|
||||
%%
|
||||
-module(index_builder).
|
||||
-include("vlog.hrl").
|
||||
-compile(export_all).
|
||||
-behaviour(gen_server).
|
||||
-export([init/1,
|
||||
handle_call/3,
|
||||
handle_cast/2,
|
||||
handle_info/2,
|
||||
terminate/2,
|
||||
code_change/3]).
|
||||
-export([start_link/2,
|
||||
start_standalone/1,
|
||||
start_standalone/2,
|
||||
stop/0]).
|
||||
-record(state, {work_on = [], done = [], workers = []}).
|
||||
-define(WORKDIR, "sync/").
|
||||
-define(DBPOOL, index_builder_pool).
|
||||
-define(SYNC_TODAY_INTERVAL, 5*60*1000).
|
||||
|
||||
start_dep_apps() ->
|
||||
code:add_path("deps/bson/ebin"),
|
||||
code:add_path("deps/mongodb/ebin"),
|
||||
code:add_path("deps/ibrowse/ebin"),
|
||||
Apps = [asn1, crypto, public_key, ssl, inets, bson, mongodb],
|
||||
[application:start(App) || App <- Apps],
|
||||
ibrowse:start().
|
||||
|
||||
start_standalone([IP, Port]) ->
|
||||
IPort = list_to_integer(Port),
|
||||
start_standalone(IP, IPort),
|
||||
receive
|
||||
fuck_erl_s_option -> ok
|
||||
end.
|
||||
|
||||
start_standalone(DBIP, DBPort) ->
|
||||
start_dep_apps(),
|
||||
start_link(DBIP, DBPort).
|
||||
|
||||
start_link(DBIP, DBPort) ->
|
||||
gen_server:start_link({local, srv_name()}, ?MODULE, [DBIP, DBPort], []).
|
||||
|
||||
stop() ->
|
||||
gen_server:cast(srv_name(), stop).
|
||||
|
||||
srv_name() ->
|
||||
?MODULE.
|
||||
|
||||
init([DBIP, DBPort]) ->
|
||||
mongo_sup:start_pool(?DBPOOL, 5, {DBIP, DBPort}),
|
||||
filelib:ensure_dir("log/"),
|
||||
vlog:start_link("log/hash_cache.txt", 0),
|
||||
{Done, WorkOn} = load_status(?WORKDIR),
|
||||
?I(?FMT("done ~p, workon ~p", [Done, WorkOn])),
|
||||
NewWorkOn = intersect_new_files(?WORKDIR, Done ++ WorkOn),
|
||||
?I(?FMT("new workon ~p", [NewWorkOn])),
|
||||
save_status(?WORKDIR, Done, NewWorkOn),
|
||||
{ok, #state{work_on = WorkOn ++ NewWorkOn, done = Done}, 0}.
|
||||
|
||||
terminate(_, State) ->
|
||||
mongo_sup:stop_pool(?DBPOOL),
|
||||
vlog:stop(),
|
||||
{ok, State}.
|
||||
|
||||
code_change(_, _, State) ->
|
||||
{ok, State}.
|
||||
|
||||
handle_info(sync_today, State) ->
|
||||
#state{work_on = WorkOn} = State,
|
||||
FileName = index_download:today_file_name(),
|
||||
case lists:member(FileName, WorkOn) of
|
||||
true ->
|
||||
% the file is processing, we should wait
|
||||
?I(?FMT("today index file ~s is processing, wait", [FileName])),
|
||||
schedule_update_today();
|
||||
false ->
|
||||
?I(?FMT("start to download today index file ~s", [FileName])),
|
||||
index_download:download()
|
||||
end,
|
||||
{noreply, State};
|
||||
|
||||
handle_info({sync_torrent_index, ok, FileName}, State) ->
|
||||
#state{workers = Workers, work_on = WorkOn} = State,
|
||||
schedule_update_today(),
|
||||
?I(?FMT("today index file ~s download success", [FileName])),
|
||||
Pid = start_worker(FileName),
|
||||
{noreply, State#state{work_on = [FileName|WorkOn], workers = [Pid|Workers]}};
|
||||
|
||||
handle_info({sync_torrent_index, failed, FileName}, State) ->
|
||||
?W(?FMT("today index file ~s download failed", [FileName])),
|
||||
schedule_update_today(),
|
||||
{noreply, State};
|
||||
|
||||
handle_info({worker_done, Pid, FileName}, State) ->
|
||||
?I(?FMT("worker ~s done", [FileName])),
|
||||
#state{workers = Workers, done = Done, work_on = WorkOn} = State,
|
||||
NewWorkers = lists:delete(Pid, Workers),
|
||||
case length(NewWorkers) of
|
||||
0 ->
|
||||
?I("all index files have been done"),
|
||||
io:format("all index files have been done~n", []);
|
||||
_ ->
|
||||
ok
|
||||
end,
|
||||
NewDone = [FileName|Done],
|
||||
NewWorkOn = lists:delete(FileName, WorkOn),
|
||||
save_status(?WORKDIR, NewDone, NewWorkOn),
|
||||
{noreply, State#state{workers = NewWorkers, done = NewDone, work_on = NewWorkOn}};
|
||||
|
||||
handle_info(timeout, State) ->
|
||||
#state{work_on = WorkOn} = State,
|
||||
Workers = [start_worker(FileName) || FileName <- WorkOn],
|
||||
schedule_update_today(),
|
||||
{noreply, State#state{workers = Workers}}.
|
||||
|
||||
handle_cast(stop, State) ->
|
||||
{stop, normal, State}.
|
||||
|
||||
handle_call(_, _From, State) ->
|
||||
{reply, not_implemented, State}.
|
||||
|
||||
schedule_update_today() ->
|
||||
timer:send_after(?SYNC_TODAY_INTERVAL, sync_today).
|
||||
|
||||
%%
|
||||
start_worker(FileName) ->
|
||||
Conn = mongo_pool:get(?DBPOOL),
|
||||
index_file:start(Conn, FileName).
|
||||
|
||||
intersect_new_files(Dir, Processed) ->
|
||||
Files = index_file_list(Dir),
|
||||
lists:foldl(fun(F, Acc) ->
|
||||
case lists:member(F, Processed) of
|
||||
true -> Acc;
|
||||
false -> [F|Acc]
|
||||
end
|
||||
end, [], Files).
|
||||
|
||||
load_status(Dir) ->
|
||||
case file:consult(Dir ++ "index.sta") of
|
||||
{ok, [Status]} ->
|
||||
Done = proplists:get_value(processed, Status),
|
||||
WorkOn = proplists:get_value(processing, Status),
|
||||
{Done, WorkOn};
|
||||
{error, _} ->
|
||||
{[], []}
|
||||
end.
|
||||
|
||||
save_status(Dir, Done, WorkOn) ->
|
||||
Status = [{processed, Done}, {processing, WorkOn}],
|
||||
file:write_file(Dir ++ "index.sta", io_lib:fwrite("~p.\n",[Status])).
|
||||
|
||||
index_file_list(Dir) ->
|
||||
Files = filelib:wildcard(Dir ++ "*.txt"),
|
||||
Files.
|
||||
|
52
src/cache_indexer/index_download.erl
Normal file
52
src/cache_indexer/index_download.erl
Normal file
@ -0,0 +1,52 @@
|
||||
%%
|
||||
%% index_download.erl
|
||||
%% Kevin Lynx
|
||||
%% 07.14.2013
|
||||
%%
|
||||
-module(index_download).
|
||||
-export([download/0, download/2, today_file_name/0, do_download/2]).
|
||||
-define(DOMAIN, "http://torrage.com").
|
||||
-define(WORKDIR, "sync/").
|
||||
|
||||
download() ->
|
||||
{Date, _} = calendar:local_time(),
|
||||
download(self(), Date).
|
||||
|
||||
download(From, Date) ->
|
||||
spawn_link(?MODULE, do_download, [From, Date]).
|
||||
|
||||
do_download(From, {_, _, _} = Date) ->
|
||||
File = format_file_name(Date),
|
||||
URL = format_file_url(?DOMAIN, File),
|
||||
io:format("download file ~s~n", [URL]),
|
||||
Start = now(),
|
||||
{ok, Code, _, Body} = ibrowse:send_req(URL, [], get, [], [], infinity),
|
||||
Dir = ?WORKDIR,
|
||||
filelib:ensure_dir(Dir),
|
||||
FullFile = Dir ++ File,
|
||||
Ret = try_save(FullFile, Code, Body, timer:now_diff(now(), Start) div 1000),
|
||||
From ! Ret.
|
||||
|
||||
try_save(FullFile, "200", Body, Time) ->
|
||||
file:write_file(FullFile, Body),
|
||||
Size = length(Body),
|
||||
Speed = Size * 1000 div Time,
|
||||
io:format("download index file ~s success ~b bytes, ~b bytes/sec~n", [FullFile, Size, Speed]),
|
||||
{sync_torrent_index, ok, FullFile};
|
||||
try_save(FullFile, Code, _, _) ->
|
||||
io:format("download index file ~s failed ~p~n", [FullFile, Code]),
|
||||
{sync_torrent_index, failed, FullFile}.
|
||||
|
||||
today_file_name() ->
|
||||
{Date, _} = calendar:local_time(),
|
||||
?WORKDIR ++ format_file_name(Date).
|
||||
|
||||
format_file_name({Y, M, 0}) ->
|
||||
lists:flatten(io_lib:format("~b~2..0b.txt", [Y, M]));
|
||||
format_file_name({Y, M, D}) ->
|
||||
lists:flatten(io_lib:format("~b~2..0b~2..0b.txt", [Y, M, D])).
|
||||
|
||||
format_file_url(Domain, File) ->
|
||||
Domain ++ "/sync/" ++ File.
|
||||
|
||||
|
56
src/cache_indexer/index_file.erl
Normal file
56
src/cache_indexer/index_file.erl
Normal file
@ -0,0 +1,56 @@
|
||||
%%
|
||||
%% index_file.erl
|
||||
%% Kevin Lynx
|
||||
%% 07.14.2013
|
||||
%%
|
||||
-module(index_file).
|
||||
-export([start/2]).
|
||||
-export([worker_run/3]).
|
||||
|
||||
start(Conn, FileName) ->
|
||||
spawn_link(?MODULE, worker_run, [self(), Conn, FileName]).
|
||||
|
||||
load_position(Name) ->
|
||||
StatusFile = Name ++ ".sta",
|
||||
Pos = case file:consult(StatusFile) of
|
||||
{ok, [Status]} ->
|
||||
proplists:get_value(position, Status);
|
||||
{error, _} ->
|
||||
0
|
||||
end,
|
||||
Pos.
|
||||
|
||||
save_position(Name, Pos) ->
|
||||
StatusFile = Name ++ ".sta",
|
||||
Status = [{name, Name}, {position, Pos}],
|
||||
file:write_file(StatusFile, io_lib:fwrite("~p.\n",[Status])).
|
||||
|
||||
worker_run(Parent, Conn, FileName) ->
|
||||
Pos = load_position(FileName),
|
||||
io:format("start to process ~s from ~p~n", [FileName, Pos]),
|
||||
{ok, FP} = file:open(FileName, [read]),
|
||||
file:position(FP, Pos),
|
||||
Sum = process_hash(Conn, FileName, FP),
|
||||
Parent ! {worker_done, self(), FileName},
|
||||
file:close(FP),
|
||||
io:format("Index file ~s done, ~p hashes~n", [FileName, Sum]).
|
||||
|
||||
process_hash(Conn, FileName, FP) ->
|
||||
case io:get_line(FP, "") of
|
||||
eof -> 0;
|
||||
Line ->
|
||||
save_hash(Conn, strip_lf(Line)),
|
||||
{ok, Pos} = file:position(FP, cur),
|
||||
save_position(FileName, Pos),
|
||||
1 + process_hash(Conn, FileName, FP)
|
||||
end.
|
||||
|
||||
strip_lf(S) ->
|
||||
lists:sublist(S, length(S) - 1).
|
||||
|
||||
save_hash(Conn, Hash) when length(Hash) == 40 ->
|
||||
db_hash_index:insert(Conn, Hash);
|
||||
|
||||
save_hash(_, _) ->
|
||||
invalid.
|
||||
|
1
src/tor_builder/README.md
Normal file
1
src/tor_builder/README.md
Normal file
@ -0,0 +1 @@
|
||||
tor_builder imports local torrents to database
|
Loading…
Reference in New Issue
Block a user