config text segment method [none, simple, rmmseg], default is simple

This commit is contained in:
Kevin Lynx 2013-07-13 22:26:15 +08:00
parent 753486c16a
commit d64630950e
6 changed files with 174 additions and 18 deletions

View File

@ -197,14 +197,7 @@ seg_text(Name, Files) ->
FullName = lists:foldl(fun({S, _}, Acc) ->
Acc ++ " " ++ S
end, Name, Files),
seg_text(FullName).
seg_text(FullName) ->
case config:get(use_rmmseg, false) of
false -> list_to_binary(FullName);
true ->
rmmseg:seg_space(list_to_binary(FullName))
end.
tor_name_seg:seg_text(FullName).
-endif.
% {file1, {name, xx, length, xx}, file2, {name, xx, length, xx}}

View File

@ -34,7 +34,7 @@ start_standalone(IP, Port, Size) ->
start_dep_apps(),
tor_download:start_global(),
config:start_link("hash_reader.config", fun() -> config_default() end),
init_rmmseg(config:get(use_rmmseg, false)),
tor_name_seg:init(),
% NOTE:
Stats = {hash_reader_stats, {hash_reader_stats, start_link, [Size]}, permanent, 2000, worker, [hash_reader_stats]},
DownloadStats = {tor_download_stats, {tor_download_stats, start_link, []}, permanent, 2000, worker, [tor_download_stats]},
@ -42,14 +42,6 @@ start_standalone(IP, Port, Size) ->
DBDateRange = {db_daterange, {db_daterange, start_link, [?DBPOOLNAME]}, permanent, 1000, worker, [db_daterange]},
start_link(IP, Port, Size, [Log, DBDateRange, DownloadStats, Stats]).
init_rmmseg(true) ->
io:format("rmmseg is enabled~n", []),
rmmseg:init(),
rmmseg:load_dicts();
init_rmmseg(false) ->
io:format("rmmseg is disabled~n", []),
ok.
start_link(IP, Port, Size) ->
start_link(IP, Port, Size, []).
@ -81,5 +73,5 @@ config_default() ->
{save_to_db, false},
{save_to_file, true},
{load_from_db, false},
{use_rmmseg, false},
{text_seg, simple},
{torrent_path, "torrents/"}].

53
src/tor_name_seg.erl Normal file
View File

@ -0,0 +1,53 @@
%%
%% tor_name_seg.erl
%% Kevin Lynx
%% segment torrent name into words
%%
-module(tor_name_seg).
-include("vlog.hrl").
-export([init/0, seg_text/1]).
%% text_seg: [rmmseg, none, simple]
%% Look up the configured `text_seg` method (falling back to `simple` when the
%% key is absent), print which method is in use, and run the one-time setup
%% for that method via do_init/1.
init() ->
    SegMethod = config:get(text_seg, simple),
    io:format("text segment use `~p`~n", [SegMethod]),
    do_init(SegMethod).
%% Segment the string Text with the method currently configured under
%% `text_seg` (default `simple`). The method is read from config on every
%% call. Only lists are accepted; any other argument fails the guard.
seg_text(Text) when is_list(Text) ->
    do_seg_text(config:get(text_seg, simple), Text).
%% One-time setup for a text segment method.
%%   simple - nothing to prepare.
%%   none   - nothing to prepare; prints a warning about non-english search.
%%   rmmseg - calls rmmseg:init/0 and then rmmseg:load_dicts/0, returning
%%            whatever load_dicts/0 returns.
%% Any other value is reported on stdout and otherwise ignored (returns ok).
do_init(simple) ->
    ok;
do_init(none) ->
    io:format("warning: text segment `none` can NOT search by non-english text~n", []),
    ok;
do_init(rmmseg) ->
    rmmseg:init(),
    rmmseg:load_dicts();
do_init(Other) ->
    io:format("unknown text segment method ~p, only support [none, simple, rmmseg]~n", [Other]),
    ok.
%% Segment the string Text with the given method.
%%   none   - Text is returned unsegmented, converted with list_to_binary/1.
%%   rmmseg - Text is converted to a binary and handed to rmmseg:seg_space/1.
%%   simple - Text is split with string_split:split/1. On `{ok, R}` the
%%            result R is returned as-is; on `error` / `incomplete` the
%%            failure is logged and the unsegmented text is returned as a
%%            binary, so indexing can still proceed.
%% Any other method falls back to `simple`. do_init/1 accepts an unknown
%% method with a warning instead of failing, so without this clause every
%% later call would crash with function_clause. `simple` is the documented
%% default and needs no initialisation, which makes it a safe fallback.
do_seg_text(none, Text) ->
    list_to_binary(Text);
do_seg_text(rmmseg, Text) ->
    rmmseg:seg_space(list_to_binary(Text));
do_seg_text(simple, Text) ->
    case string_split:split(Text) of
        {error, L, D} ->
            ?E(?FMT("string split failed(error): ~p ~p", [L, D])),
            list_to_binary(Text);
        {incomplete, L, D} ->
            ?E(?FMT("string split failed(incomplte): ~p ~p", [L, D])),
            list_to_binary(Text);
        {ok, R} ->
            R
    end;
do_seg_text(_Unknown, Text) ->
    %% The unknown method was already reported once by do_init/1; do not
    %% repeat the warning for every torrent name.
    do_seg_text(simple, Text).

43
src/transfer.erl Normal file
View File

@ -0,0 +1,43 @@
-module(transfer).
-compile(export_all).
-export([start/0]).
-define(DBNAME, torrents).
-define(COLLNAME, hash).
-define(ONE_DAY, 24*60*60).
-define(READ_POOL, read_pool).
-define(WRITE_POOL, write_pool).
-export([start/2]).
%% Start the read and write connection pools (5 connections each), then
%% spawn one unlinked process per day range from gen_day_ranges/0, each
%% running start/2 on its range. Returns ok immediately; it does not wait
%% for the spawned processes.
start() ->
    mongo_sup:start_pool(?READ_POOL, 5, {localhost, 10000}),
    mongo_sup:start_pool(?WRITE_POOL, 5, {localhost, 27010}),
    lists:foreach(
        fun({DayStart, DayEnd}) ->
            spawn(?MODULE, start, [DayStart, DayEnd])
        end,
        gen_day_ranges()),
    ok.
%% Build 16 `{Start, End}` ranges, one per offset 0..15, by applying
%% day_secs_at/2 to the value of time_util:now_day_seconds/0
%% (presumably the start of the current day in seconds -- confirm in time_util).
gen_day_ranges() ->
    Today = time_util:now_day_seconds(),
    lists:map(fun(DaysAgo) -> day_secs_at(Today, DaysAgo) end, lists:seq(0, 15)).
%% Return `{Start, End}` for the day lying Before days ahead of Today:
%% Start is Today minus Before whole days, End is exactly one day later.
day_secs_at(Today, Before) ->
    DayStart = Today - Before * ?ONE_DAY,
    {DayStart, DayStart + ?ONE_DAY}.
%% Copy every document whose `created_at` lies in [DaySecs, DaySecsMax) from
%% the read pool's database to the write pool's database, then print how
%% many documents were handled for this range.
%%
%% DaySecs    - inclusive lower bound of `created_at`.
%% DaySecsMax - exclusive upper bound of `created_at`.
start(DaySecs, DaySecsMax) ->
    RConn = mongo_pool:get(?READ_POOL),
    WConn = mongo_pool:get(?WRITE_POOL),
    Docs = mongo:do(safe, master, RConn, ?DBNAME, fun() ->
        %% The lower bound is inclusive ('$gte'). Adjacent day ranges share
        %% their boundary value, so with an exclusive bound on both sides a
        %% document created exactly on a boundary would never be copied by
        %% any worker.
        Cursor = mongo:find(?COLLNAME, {created_at, {'$gte', DaySecs, '$lt', DaySecsMax}}),
        mongo_cursor:rest(Cursor)
    end),
    case Docs of
        [] ->
            %% Nothing to copy for this range; skip the insert call.
            ok;
        _ ->
            mongo:do(safe, master, WConn, ?DBNAME, fun() ->
                mongo:insert(?COLLNAME, Docs)
            end)
    end,
    io:format("done at ~p size ~p~n", [DaySecs, length(Docs)]).

74
src/utils/name_seger.erl Normal file
View File

@ -0,0 +1,74 @@
%%
%% name_seger.erl
%% Kevin Lynx
%% segment name by rmmseg
%%
-module(name_seger).
-export([start/0]).
-define(DBNAME, torrents).
-define(COLLNAME, hashes).
-define(POOLNAME, db_pool).
-define(BATCHSIZE, 1000).
%% Add the bson and mongodb ebin directories to the code path and start the
%% applications this tool depends on, in order. Returns the list of
%% application:start/1 results; start failures are not checked here.
start_dep_apps() ->
    code:add_path("deps/bson/ebin"),
    code:add_path("deps/mongodb/ebin"),
    lists:map(fun application:start/1,
              [asn1, crypto, public_key, ssl, inets, bson, mongodb]).
%% Entry point: start the dependency applications, initialise rmmseg and load
%% its dictionaries, open a 2-connection pool to localhost:27017, then run
%% process/0 over the collection.
start() ->
    start_dep_apps(),
    rmmseg:init(),
    rmmseg:load_dicts(),
    mongo_sup:start_pool(?POOLNAME, 2, {localhost, 27017}),
    process().
%% Take a connection from the pool and, inside a single mongo:do/5 call, open
%% a cursor over every document of the collection and walk it with process/3.
process() ->
    Conn = mongo_pool:get(?POOLNAME),
    WalkAll = fun() ->
        process(mongo:find(?COLLNAME, {}), ok, 0)
    end,
    mongo:do(safe, master, Conn, ?DBNAME, WalkAll).
%% Walk the cursor one document at a time.
%%
%% Cursor - cursor to read documents from.
%% State  - `ok` while documents remain, `stop` once the cursor is exhausted.
%% Sum    - number of documents processed so far.
%%
%% Prints progress via print_stats/1 before each fetch, prints the final
%% total when done, and returns `stop`.
process(Cursor, ok, Sum) ->
    print_stats(Sum),
    case process_one(mongo_cursor:next(Cursor)) of
        ok ->
            process(Cursor, ok, Sum + 1);
        stop ->
            %% The final fetch found no document, so it must not be counted;
            %% counting it would report a total that is one too high.
            process(Cursor, stop, Sum)
    end;
process(_Cursor, stop, Sum) ->
    io:format("process done, total ~p~n", [Sum]),
    stop.
%% Print the running count Sum whenever it is a multiple of 500 (including
%% 0); otherwise do nothing. Always returns ok.
print_stats(Sum) ->
    case Sum rem 500 of
        0 ->
            io:format(" -> ~p~n", [Sum]);
        _ ->
            ok
    end.
%% Store the segmented name on the document whose `_id` is Hash by issuing a
%% findAndModify command that `$set`s the `name_array` field. Both arguments
%% must be binaries. Returns the result of mongo:command/1.
commit(Hash, NameArray) when is_binary(Hash), is_binary(NameArray) ->
    Selector = {'_id', Hash},
    Modifier = {'$set', {name_array, NameArray}},
    mongo:command({findAndModify, ?COLLNAME, query, Selector,
                   update, Modifier, fields, {'_id', 1},
                   new, false}).
%% Handle one result of mongo_cursor:next/1.
%%   {}    - no document was returned: return `stop`.
%%   {Doc} - decode the torrent, segment its name, write the segmented name
%%           back with commit/2, and return `ok`.
process_one({}) ->
    stop;
process_one({Doc}) ->
    {Hash, NameArray} = seg_torrent(db_store_mongo:decode_torrent_item(Doc)),
    commit(list_to_binary(Hash), NameArray),
    ok.
%% Segment the searchable name of a decoded torrent with rmmseg and return
%% `{Hash, SegmentedName}`.
%%   single - only the torrent name is segmented.
%%   multi  - the torrent name followed by every file name, separated by
%%            single spaces, is segmented.
seg_torrent({single, Hash, {Name, _}, _, _}) ->
    {Hash, rmmseg:seg_space(list_to_binary(Name))};
seg_torrent({multi, Hash, {Name, Files}, _, _}) ->
    %% Prefix each file name with a space and concatenate everything in one
    %% pass. Appending to a growing accumulator with `++` would recopy the
    %% whole accumulated string for every file.
    Suffixes = lists:map(fun({File, _}) -> [$\s | File] end, Files),
    FullName = lists:append([Name | Suffixes]),
    {Hash, rmmseg:seg_space(list_to_binary(FullName))}.

View File

@ -16,4 +16,5 @@ copy ebin\*.* bin\ebin\
mkdir bin\tools
mkdir bin\tools\db-replset
copy tools\db-replset\*.* bin\tools\db-replset\
copy priv\*.dic bin\priv\
pause