diff --git a/src/db_store_mongo.erl b/src/db_store_mongo.erl index c09d7c6..276760b 100644 --- a/src/db_store_mongo.erl +++ b/src/db_store_mongo.erl @@ -197,14 +197,7 @@ seg_text(Name, Files) -> FullName = lists:foldl(fun({S, _}, Acc) -> Acc ++ " " ++ S end, Name, Files), - seg_text(FullName). - -seg_text(FullName) -> - case config:get(use_rmmseg, false) of - false -> list_to_binary(FullName); - true -> - rmmseg:seg_space(list_to_binary(FullName)) - end. + tor_name_seg:seg_text(FullName). -endif. % {file1, {name, xx, length, xx}, file2, {name, xx, length, xx}} diff --git a/src/hash_reader/hash_reader_sup.erl b/src/hash_reader/hash_reader_sup.erl index 7c0ec8a..990489a 100644 --- a/src/hash_reader/hash_reader_sup.erl +++ b/src/hash_reader/hash_reader_sup.erl @@ -34,7 +34,7 @@ start_standalone(IP, Port, Size) -> start_dep_apps(), tor_download:start_global(), config:start_link("hash_reader.config", fun() -> config_default() end), - init_rmmseg(config:get(use_rmmseg, false)), + tor_name_seg:init(), % NOTE: Stats = {hash_reader_stats, {hash_reader_stats, start_link, [Size]}, permanent, 2000, worker, [hash_reader_stats]}, DownloadStats = {tor_download_stats, {tor_download_stats, start_link, []}, permanent, 2000, worker, [tor_download_stats]}, @@ -42,14 +42,6 @@ start_standalone(IP, Port, Size) -> DBDateRange = {db_daterange, {db_daterange, start_link, [?DBPOOLNAME]}, permanent, 1000, worker, [db_daterange]}, start_link(IP, Port, Size, [Log, DBDateRange, DownloadStats, Stats]). -init_rmmseg(true) -> - io:format("rmmseg is enabled~n", []), - rmmseg:init(), - rmmseg:load_dicts(); -init_rmmseg(false) -> - io:format("rmmseg is disabled~n", []), - ok. - start_link(IP, Port, Size) -> start_link(IP, Port, Size, []). @@ -81,5 +73,5 @@ config_default() -> {save_to_db, false}, {save_to_file, true}, {load_from_db, false}, - {use_rmmseg, false}, + {text_seg, simple}, {torrent_path, "torrents/"}]. 
diff --git a/src/tor_name_seg.erl b/src/tor_name_seg.erl
new file mode 100644
index 0000000..3241fb7
--- /dev/null
+++ b/src/tor_name_seg.erl
@@ -0,0 +1,69 @@
+%%
+%% tor_name_seg.erl
+%% Kevin Lynx
+%% segment torrent name into words
+%%
+-module(tor_name_seg).
+-include("vlog.hrl").
+-export([init/0, seg_text/1]).
+
+%% The method is read from the `text_seg` config key on every call and
+%% defaults to `simple` when the key is absent.
+%% text_seg: [rmmseg, none, simple]
+
+%% Announce the configured method on stdout and run its one-time setup.
+init() ->
+    Method = config:get(text_seg, simple),
+    io:format("text segment use `~p`~n", [Method]),
+    do_init(Method).
+
+%% Segment Text (which must be a list) with the configured method.
+%% NOTE(review): do_init/1 tolerates an unknown method with a warning, but
+%% do_seg_text/2 has no clause for one, so every call would then fail
+%% with function_clause -- confirm that this is the intended behaviour.
+seg_text(Text) when is_list(Text) ->
+    Method = config:get(text_seg, simple),
+    do_seg_text(Method, Text).
+
+%% Per-method setup. Only rmmseg needs any: it is initialised and its
+%% dictionaries are loaded. Every other method, known or not, returns ok.
+do_init(rmmseg) ->
+    rmmseg:init(),
+    rmmseg:load_dicts();
+
+do_init(none) ->
+    io:format("warning: text segment `none` can NOT search by non-english text~n", []),
+    ok;
+
+do_init(simple) ->
+    ok;
+
+do_init(M) ->
+    io:format("unknown text segment method ~p, only support [none, simple, rmmseg]~n", [M]),
+    ok.
+
+%% Per-method segmentation.
+%%   none   - the text converted to a binary, otherwise untouched
+%%   rmmseg - the result of rmmseg:seg_space/1 applied to that binary
+%%   simple - the R of string_split:split/1's {ok, R}; when the split
+%%            answers error or incomplete, the failure is reported through
+%%            the ?E macro and the unsegmented binary is returned instead
+do_seg_text(none, Text) ->
+    list_to_binary(Text);
+
+do_seg_text(rmmseg, Text) ->
+    rmmseg:seg_space(list_to_binary(Text));
+
+do_seg_text(simple, Text) ->
+    case string_split:split(Text) of
+        {error, L, D} ->
+            ?E(?FMT("string split failed(error): ~p ~p", [L, D])),
+            list_to_binary(Text);
+        {incomplete, L, D} ->
+            ?E(?FMT("string split failed(incomplte): ~p ~p", [L, D])),
+            list_to_binary(Text);
+        {ok, R} ->
+            R
+    end.
+
+
diff --git a/src/transfer.erl b/src/transfer.erl
new file mode 100644
index 0000000..8c95731
--- /dev/null
+++ b/src/transfer.erl
@@ -0,0 +1,57 @@
+-module(transfer).
+-compile(export_all).
+-export([start/0]).
+-define(DBNAME, torrents).
+-define(COLLNAME, hash).
+-define(ONE_DAY, 24*60*60).
+-define(READ_POOL, read_pool).
+-define(WRITE_POOL, write_pool).
+-export([start/2]).
+
+%% Start the read pool (localhost:10000) and the write pool
+%% (localhost:27010), then spawn one start/2 process per range produced by
+%% gen_day_ranges/0. Returns ok without waiting for those processes.
+start() ->
+    mongo_sup:start_pool(?READ_POOL, 5, {localhost, 10000}),
+    mongo_sup:start_pool(?WRITE_POOL, 5, {localhost, 27010}),
+    [spawn(?MODULE, start, [DayStart, DayEnd]) ||
+        {DayStart, DayEnd} <- gen_day_ranges()],
+    ok.
+
+%% Sixteen {Start, End} ranges, each ?ONE_DAY (24*60*60) wide: the one
+%% starting at time_util:now_day_seconds() and the fifteen before it.
+gen_day_ranges() ->
+    Today = time_util:now_day_seconds(),
+    [day_secs_at(Today, Before) || Before <- lists:seq(0, 15)].
+
+%% The range starting Before days earlier than Today. Its End is the Start
+%% of the following day's range, so the ranges only partition time when
+%% read as half-open: [Start, End).
+day_secs_at(Today, Before) ->
+    {Today - Before * ?ONE_DAY, Today - Before * ?ONE_DAY + ?ONE_DAY}.
+
+%% Read every ?COLLNAME document whose created_at lies in
+%% [DaySecs, DaySecsMax) through the read pool, insert the result through
+%% the write pool unless it is empty, and print DaySecs with length(Docs).
+start(DaySecs, DaySecsMax) ->
+    RConn = mongo_pool:get(?READ_POOL),
+    WConn = mongo_pool:get(?WRITE_POOL),
+    Docs = mongo:do(safe, master, RConn, ?DBNAME, fun() ->
+        %% '$gte' rather than '$gt': adjacent ranges share their boundary, so
+        %% with two strict bounds a document stamped exactly on a boundary
+        %% matched no range at all and was never copied.
+        Cursor = mongo:find(?COLLNAME, {created_at, {'$gte', DaySecs, '$lt', DaySecsMax}}),
+        mongo_cursor:rest(Cursor)
+    end),
+    case Docs of
+        [] ->
+            ok;
+        _ ->
+            mongo:do(safe, master, WConn, ?DBNAME, fun() ->
+                mongo:insert(?COLLNAME, Docs)
+            end)
+    end,
+    io:format("done at ~p size ~p~n", [DaySecs, length(Docs)]).
+
+
+
diff --git a/src/utils/name_seger.erl b/src/utils/name_seger.erl
new file mode 100644
index 0000000..84eeaf4
--- /dev/null
+++ b/src/utils/name_seger.erl
@@ -0,0 +1,100 @@
+%%
+%% name_seger.erl
+%% Kevin Lynx
+%% segment name by rmmseg
+%%
+-module(name_seger).
+-export([start/0]).
+-define(DBNAME, torrents).
+-define(COLLNAME, hashes).
+-define(POOLNAME, db_pool).
+-define(BATCHSIZE, 1000).
+
+%% Add the bson and mongodb ebin directories to the code path, then start
+%% each listed application in order. The start results come back as a list;
+%% nothing in this module inspects them.
+start_dep_apps() ->
+    code:add_path("deps/bson/ebin"),
+    code:add_path("deps/mongodb/ebin"),
+    Apps = [asn1, crypto, public_key, ssl, inets, bson, mongodb],
+    [application:start(App) || App <- Apps].
+
+%% Entry point: start the dependencies, initialise rmmseg and load its
+%% dictionaries, start the ?POOLNAME pool against localhost:27017, then
+%% segment the whole collection.
+start() ->
+    start_dep_apps(),
+    rmmseg:init(),
+    rmmseg:load_dicts(),
+    IP = localhost,
+    Port = 27017,
+    mongo_sup:start_pool(?POOLNAME, 2, {IP, Port}),
+    process().
+
+%% Iterate a cursor over every document of ?COLLNAME inside one mongo:do/5
+%% action.
+process() ->
+    Conn = mongo_pool:get(?POOLNAME),
+    mongo:do(safe, master, Conn, ?DBNAME, fun() ->
+        Cursor = mongo:find(?COLLNAME, {}),
+        process(Cursor, ok, 0)
+    end).
+
+%% Cursor loop. The second argument is the outcome of the previous step
+%% (ok = fetch another document, stop = finish); Sum counts the documents
+%% processed so far. Sum advances only when process_one/1 really handled a
+%% document: counting the last fetch, which merely finds the cursor empty,
+%% made the printed total one too high.
+process(Cursor, ok, Sum) ->
+    print_stats(Sum),
+    case process_one(mongo_cursor:next(Cursor)) of
+        ok ->
+            process(Cursor, ok, Sum + 1);
+        stop ->
+            process(Cursor, stop, Sum)
+    end;
+
+process(_Cursor, stop, Sum) ->
+    io:format("process done, total ~p~n", [Sum]),
+    stop.
+
+%% Print a progress line whenever Sum is a multiple of 500 (0 included).
+print_stats(Sum) ->
+    case Sum rem 500 == 0 of
+        true ->
+            io:format(" -> ~p~n", [Sum]);
+        false ->
+            ok
+    end.
+
+%% Set the name_array field of the document whose '_id' is Hash to NameArray
+%% with a findAndModify command. Reached only from within the mongo:do/5
+%% action opened by process/0.
+commit(Hash, NameArray) when is_binary(Hash), is_binary(NameArray) ->
+    Cmd = {findAndModify, ?COLLNAME, query, {'_id', Hash},
+        update, {'$set', {name_array, NameArray}}, fields, {'_id', 1},
+        new, false},
+    mongo:command(Cmd).
+
+%% Handle one mongo_cursor:next/1 result: {} gives stop; {Doc} is decoded,
+%% segmented and committed, giving ok.
+process_one({}) ->
+    stop;
+process_one({Doc}) ->
+    Torrent = db_store_mongo:decode_torrent_item(Doc),
+    {Hash, NameArray} = seg_torrent(Torrent),
+    commit(list_to_binary(Hash), NameArray),
+    ok.
+
+%% Pair the torrent's hash with its name segmented by rmmseg:seg_space/1.
+%% For a multi torrent the first element of each Files entry is appended to
+%% the name, separated by single spaces, before segmenting.
+seg_torrent({single, Hash, {Name, _}, _, _}) ->
+    {Hash, rmmseg:seg_space(list_to_binary(Name))};
+
+seg_torrent({multi, Hash, {Name, Files}, _, _}) ->
+    FullName = lists:foldl(fun({S, _}, Acc) ->
+        Acc ++ " " ++ S
+    end, Name, Files),
+    {Hash, rmmseg:seg_space(list_to_binary(FullName))}.
+
diff --git a/tools/create_bin.bat b/tools/create_bin.bat
index 60386d1..2606922 100644
--- a/tools/create_bin.bat
+++ b/tools/create_bin.bat
@@ -16,4 +16,5 @@ copy ebin\*.* bin\ebin\
 mkdir bin\tools
 mkdir bin\tools\db-replset
 copy tools\db-replset\*.* bin\tools\db-replset\
+copy priv\*.dic bin\priv\
 pause