mirror of
https://github.com/btdig/dhtcrawler2.git
synced 2025-01-19 04:31:37 +00:00
config text segment method [none, simple, rmmseg], default is simple
This commit is contained in:
parent
753486c16a
commit
d64630950e
@ -197,14 +197,7 @@ seg_text(Name, Files) ->
|
||||
FullName = lists:foldl(fun({S, _}, Acc) ->
|
||||
Acc ++ " " ++ S
|
||||
end, Name, Files),
|
||||
seg_text(FullName).
|
||||
|
||||
seg_text(FullName) ->
|
||||
case config:get(use_rmmseg, false) of
|
||||
false -> list_to_binary(FullName);
|
||||
true ->
|
||||
rmmseg:seg_space(list_to_binary(FullName))
|
||||
end.
|
||||
tor_name_seg:seg_text(FullName).
|
||||
-endif.
|
||||
|
||||
% {file1, {name, xx, length, xx}, file2, {name, xx, length, xx}}
|
||||
|
@ -34,7 +34,7 @@ start_standalone(IP, Port, Size) ->
|
||||
start_dep_apps(),
|
||||
tor_download:start_global(),
|
||||
config:start_link("hash_reader.config", fun() -> config_default() end),
|
||||
init_rmmseg(config:get(use_rmmseg, false)),
|
||||
tor_name_seg:init(),
|
||||
% NOTE:
|
||||
Stats = {hash_reader_stats, {hash_reader_stats, start_link, [Size]}, permanent, 2000, worker, [hash_reader_stats]},
|
||||
DownloadStats = {tor_download_stats, {tor_download_stats, start_link, []}, permanent, 2000, worker, [tor_download_stats]},
|
||||
@ -42,14 +42,6 @@ start_standalone(IP, Port, Size) ->
|
||||
DBDateRange = {db_daterange, {db_daterange, start_link, [?DBPOOLNAME]}, permanent, 1000, worker, [db_daterange]},
|
||||
start_link(IP, Port, Size, [Log, DBDateRange, DownloadStats, Stats]).
|
||||
|
||||
init_rmmseg(true) ->
|
||||
io:format("rmmseg is enabled~n", []),
|
||||
rmmseg:init(),
|
||||
rmmseg:load_dicts();
|
||||
init_rmmseg(false) ->
|
||||
io:format("rmmseg is disabled~n", []),
|
||||
ok.
|
||||
|
||||
start_link(IP, Port, Size) ->
|
||||
start_link(IP, Port, Size, []).
|
||||
|
||||
@ -81,5 +73,5 @@ config_default() ->
|
||||
{save_to_db, false},
|
||||
{save_to_file, true},
|
||||
{load_from_db, false},
|
||||
{use_rmmseg, false},
|
||||
{text_seg, simple},
|
||||
{torrent_path, "torrents/"}].
|
||||
|
53
src/tor_name_seg.erl
Normal file
53
src/tor_name_seg.erl
Normal file
@ -0,0 +1,53 @@
|
||||
%%
|
||||
%% tor_name_seg.erl
|
||||
%% Kevin Lynx
|
||||
%% segment torrent name into words
|
||||
%%
|
||||
-module(tor_name_seg).
|
||||
-include("vlog.hrl").
|
||||
-export([init/0, seg_text/1]).
|
||||
|
||||
%% text_seg: [rmmseg, none, simple]
|
||||
%% Initialize the text-segmentation subsystem according to the
%% `text_seg' config key (defaults to `simple').
init() ->
    SegMethod = config:get(text_seg, simple),
    io:format("text segment use `~p`~n", [SegMethod]),
    do_init(SegMethod).
|
||||
|
||||
%% Segment a torrent name (a plain string) into words with the
%% configured method; returns a binary.
seg_text(Text) when is_list(Text) ->
    do_seg_text(config:get(text_seg, simple), Text).
|
||||
|
||||
%% Per-method initialization.
%%  * rmmseg requires the NIF and its dictionaries to be loaded;
%%  * simple needs nothing;
%%  * none only warns that non-english search will not work;
%%  * anything else is reported as unknown and ignored.
do_init(rmmseg) ->
    rmmseg:init(),
    rmmseg:load_dicts();
do_init(simple) ->
    ok;
do_init(none) ->
    io:format("warning: text segment `none` can NOT search by non-english text~n", []),
    ok;
do_init(M) ->
    io:format("unknown text segment method ~p, only support [none, simple, rmmseg]~n", [M]),
    ok.
|
||||
|
||||
%% Segment `Text' with the given method, returning a binary.
%%  * none   -> no segmentation, just convert the string to a binary;
%%  * rmmseg -> space-separated words from the rmmseg segmenter;
%%  * simple -> split via string_split, falling back to the raw
%%              binary (with an error log) when splitting fails.
do_seg_text(none, Text) ->
    list_to_binary(Text);
do_seg_text(rmmseg, Text) ->
    rmmseg:seg_space(list_to_binary(Text));
do_seg_text(simple, Text) ->
    case string_split:split(Text) of
        {error, L, D} ->
            ?E(?FMT("string split failed(error): ~p ~p", [L, D])),
            list_to_binary(Text);
        {incomplete, L, D} ->
            %% fixed log-message typo: was "incomplte"
            ?E(?FMT("string split failed(incomplete): ~p ~p", [L, D])),
            list_to_binary(Text);
        {ok, R} ->
            R
    end.
|
||||
|
||||
|
43
src/transfer.erl
Normal file
43
src/transfer.erl
Normal file
@ -0,0 +1,43 @@
|
||||
%% One-shot migration tool: copy recent torrent documents from one
%% mongod (read pool, port 10000) to another (write pool, port 27010).
%% start/0 spawns one start/2 worker per day range.
-module(transfer).
%% Explicit exports instead of -compile(export_all): start/2 must stay
%% exported because start/0 spawns it via spawn(?MODULE, start, ...).
-export([start/0, start/2]).
%% Previously reachable through export_all; kept exported so any
%% external caller keeps working.
-export([gen_day_ranges/0, day_secs_at/2]).

-define(DBNAME, torrents).
-define(COLLNAME, hash).
-define(ONE_DAY, 24*60*60).
-define(READ_POOL, read_pool).
-define(WRITE_POOL, write_pool).
|
||||
|
||||
%% Entry point: open one mongo pool per endpoint, then spawn a
%% transfer worker for every day range of the last 16 days.
start() ->
    mongo_sup:start_pool(?READ_POOL, 5, {localhost, 10000}),
    mongo_sup:start_pool(?WRITE_POOL, 5, {localhost, 27010}),
    lists:foreach(fun({DayStart, DayEnd}) ->
        spawn(?MODULE, start, [DayStart, DayEnd])
    end, gen_day_ranges()),
    ok.
|
||||
|
||||
%% Day ranges covering today and each of the previous 15 days.
gen_day_ranges() ->
    TodaySecs = time_util:now_day_seconds(),
    lists:map(fun(DaysBack) -> day_secs_at(TodaySecs, DaysBack) end,
              lists:seq(0, 15)).
|
||||
|
||||
%% {StartSecs, EndSecs} of the day `Before' days before `Today'
%% (both expressed in seconds; End = Start + one day).
day_secs_at(Today, Before) ->
    DayStart = Today - Before * ?ONE_DAY,
    {DayStart, DayStart + ?ONE_DAY}.
|
||||
|
||||
%% Worker: read every document created inside (DaySecs, DaySecsMax)
%% from the source server and bulk-insert it into the destination.
start(DaySecs, DaySecsMax) ->
    ReadConn = mongo_pool:get(?READ_POOL),
    WriteConn = mongo_pool:get(?WRITE_POOL),
    Docs = mongo:do(safe, master, ReadConn, ?DBNAME, fun() ->
        Sel = {created_at, {'$gt', DaySecs, '$lt', DaySecsMax}},
        mongo_cursor:rest(mongo:find(?COLLNAME, Sel))
    end),
    write_docs(WriteConn, Docs),
    io:format("done at ~p size ~p~n", [DaySecs, length(Docs)]).

%% Insert a batch on the destination; an empty day is a no-op.
write_docs(_Conn, []) ->
    ok;
write_docs(Conn, Docs) ->
    mongo:do(safe, master, Conn, ?DBNAME, fun() ->
        mongo:insert(?COLLNAME, Docs)
    end).
|
||||
|
||||
|
||||
|
74
src/utils/name_seger.erl
Normal file
74
src/utils/name_seger.erl
Normal file
@ -0,0 +1,74 @@
|
||||
%%
|
||||
%% name_seger.erl
|
||||
%% Kevin Lynx
|
||||
%% segment name by rmmseg
|
||||
%%
|
||||
-module(name_seger).
|
||||
-export([start/0]).
|
||||
-define(DBNAME, torrents).
|
||||
-define(COLLNAME, hashes).
|
||||
-define(POOLNAME, db_pool).
|
||||
-define(BATCHSIZE, 1000).
|
||||
|
||||
%% Add the compiled dependency paths and start every application the
%% mongodb driver needs; returns one application:start/1 result per app.
start_dep_apps() ->
    code:add_path("deps/bson/ebin"),
    code:add_path("deps/mongodb/ebin"),
    DepApps = [asn1, crypto, public_key, ssl, inets, bson, mongodb],
    lists:map(fun application:start/1, DepApps).
|
||||
|
||||
%% Entry point: boot dependencies, load the rmmseg dictionaries, open
%% the mongo pool on localhost:27017 and walk the whole collection.
start() ->
    start_dep_apps(),
    rmmseg:init(),
    rmmseg:load_dicts(),
    mongo_sup:start_pool(?POOLNAME, 2, {localhost, 27017}),
    process().
|
||||
|
||||
%% Open a cursor over every document and drive the recursive walk
%% inside a single mongo action.
process() ->
    Conn = mongo_pool:get(?POOLNAME),
    Walk = fun() -> process(mongo:find(?COLLNAME, {}), ok, 0) end,
    mongo:do(safe, master, Conn, ?DBNAME, Walk).
|
||||
|
||||
%% Tail-recursive cursor walk: `ok' keeps consuming documents,
%% `stop' (cursor exhausted) ends the run and reports the total.
process(_Cursor, stop, Sum) ->
    io:format("process done, total ~p~n", [Sum]),
    stop;
process(Cursor, ok, Sum) ->
    print_stats(Sum),
    Next = process_one(mongo_cursor:next(Cursor)),
    process(Cursor, Next, Sum + 1).
|
||||
|
||||
%% Print a progress line every 500 processed documents; always
%% returns `ok'.
print_stats(Sum) when Sum rem 500 =:= 0 ->
    io:format(" -> ~p~n", [Sum]);
print_stats(_Sum) ->
    ok.
|
||||
|
||||
%% Persist the segmented name into the document's `name_array' field,
%% keyed by the info-hash `_id', via findAndModify.
commit(Hash, NameArray) when is_binary(Hash), is_binary(NameArray) ->
    mongo:command({findAndModify, ?COLLNAME,
                   query, {'_id', Hash},
                   update, {'$set', {name_array, NameArray}},
                   fields, {'_id', 1},
                   new, false}).
|
||||
|
||||
%% Handle one cursor step: `{Doc}' is decoded, segmented and written
%% back (returns `ok'); `{}' means the cursor is exhausted (`stop').
process_one({Doc}) ->
    Torrent = db_store_mongo:decode_torrent_item(Doc),
    {Hash, NameArray} = seg_torrent(Torrent),
    commit(list_to_binary(Hash), NameArray),
    ok;
process_one({}) ->
    stop.
|
||||
|
||||
%% Build the rmmseg-segmented name for a torrent record.
%% Returns {Hash, SegmentedNameBinary}.
seg_torrent({single, Hash, {Name, _}, _, _}) ->
    {Hash, rmmseg:seg_space(list_to_binary(Name))};
seg_torrent({multi, Hash, {Name, Files}, _, _}) ->
    %% Join the torrent name and every file name with single spaces.
    %% An iolist replaces the old `Acc ++ " " ++ S' foldl, which was
    %% O(n^2); list_to_binary/1 flattens iolists natively, producing
    %% exactly the same bytes.
    FullName = [Name | [[$\s, FileName] || {FileName, _} <- Files]],
    {Hash, rmmseg:seg_space(list_to_binary(FullName))}.
|
||||
|
@ -16,4 +16,5 @@ copy ebin\*.* bin\ebin\
|
||||
mkdir bin\tools
|
||||
mkdir bin\tools\db-replset
|
||||
copy tools\db-replset\*.* bin\tools\db-replset\
|
||||
copy priv\*.dic bin\priv\
|
||||
pause
|
||||
|
Loading…
Reference in New Issue
Block a user