add rmmseg, with a pre-compiled win32 version

This commit is contained in:
Kevin Lynx 2013-07-13 21:53:03 +08:00
parent 77893f0759
commit 753486c16a
7 changed files with 133019 additions and 9 deletions

12638
priv/chars.dic Normal file

File diff suppressed because it is too large Load Diff

120308
priv/words.dic Normal file

File diff suppressed because it is too large Load Diff

View File

@ -18,6 +18,7 @@
search_recently/2,
search_newest_top/3,
search/2]).
-export([decode_torrent_item/1]).
-compile(export_all).
-define(DBNAME, torrents).
-define(COLLNAME, hashes).
@ -183,15 +184,7 @@ create_torrent_desc(Conn, Hash, Name, Length, Announce, Files) ->
files, encode_file_list(Files)}.
-else.
create_torrent_desc(_Conn, Hash, Name, Length, Announce, Files) ->
NameArray = case string_split:split(Name) of
{error, L, D} ->
?E(?FMT("string split failed(error): ~p ~p", [L, D])),
[Name];
{incomplete, L, D} ->
?E(?FMT("string split failed(incomplte): ~p ~p", [L, D])),
[Name];
{ok, R} -> R
end,
NameArray = seg_text(Name, Files),
{'_id', list_to_binary(Hash),
name, list_to_binary(Name),
name_array, NameArray,
@ -199,6 +192,19 @@ create_torrent_desc(_Conn, Hash, Name, Length, Announce, Files) ->
created_at, time_util:now_seconds(),
announce, Announce,
files, encode_file_list(Files)}.
%% Builds the searchable text for a torrent and hands it to seg_text/1.
%%
%% Name  - the torrent name, as a character list.
%% Files - list of 2-tuples; the first element of each tuple is appended to
%%         the name, each one preceded by a single space. The second element
%%         is ignored here.
%%
%% Returns whatever seg_text/1 returns for the combined text.
%%
%% The pieces are collected first and concatenated in one pass with
%% lists:append/1, so the accumulated text is not re-copied once per file
%% (which is what repeatedly evaluating `Acc ++ ...` in a fold does).
seg_text(Name, Files) ->
    Pieces = lists:map(fun({S, _}) -> " " ++ S end, Files),
    seg_text(lists:append([Name | Pieces])).
%% Converts the combined text to a binary and, when the `use_rmmseg`
%% configuration flag is true, runs it through rmmseg:seg_space/1.
%% With the flag false (also the default passed to config:get/2) the plain
%% binary is returned unchanged.
seg_text(FullName) ->
    case config:get(use_rmmseg, false) of
        true ->
            Text = list_to_binary(FullName),
            rmmseg:seg_space(Text);
        false ->
            list_to_binary(FullName)
    end.
-endif.
% {file1, {name, xx, length, xx}, file2, {name, xx, length, xx}}

View File

@ -34,6 +34,7 @@ start_standalone(IP, Port, Size) ->
start_dep_apps(),
tor_download:start_global(),
config:start_link("hash_reader.config", fun() -> config_default() end),
init_rmmseg(config:get(use_rmmseg, false)),
% NOTE:
Stats = {hash_reader_stats, {hash_reader_stats, start_link, [Size]}, permanent, 2000, worker, [hash_reader_stats]},
DownloadStats = {tor_download_stats, {tor_download_stats, start_link, []}, permanent, 2000, worker, [tor_download_stats]},
@ -41,6 +42,14 @@ start_standalone(IP, Port, Size) ->
DBDateRange = {db_daterange, {db_daterange, start_link, [?DBPOOLNAME]}, permanent, 1000, worker, [db_daterange]},
start_link(IP, Port, Size, [Log, DBDateRange, DownloadStats, Stats]).
%% Prepares the rmmseg segmenter according to the `use_rmmseg` setting.
%%
%% false - only prints a notice and returns ok.
%% true  - prints a notice, calls rmmseg:init/0 and then returns the result
%%         of rmmseg:load_dicts/0.
init_rmmseg(false) ->
    io:format("rmmseg is disabled~n"),
    ok;
init_rmmseg(true) ->
    io:format("rmmseg is enabled~n"),
    rmmseg:init(),
    rmmseg:load_dicts().
%% Convenience form of start_link/4: forwards IP, Port and Size and passes
%% an empty list as the fourth argument.
start_link(IP, Port, Size) ->
start_link(IP, Port, Size, []).
@ -72,4 +81,5 @@ config_default() ->
{save_to_db, false},
{save_to_file, true},
{load_from_db, false},
{use_rmmseg, false},
{torrent_path, "torrents/"}].

2
src/rmmseg/README.md Normal file
View File

@ -0,0 +1,2 @@
If you want to use rmmseg in dhtcrawler2 on Windows with a Win32 build of Erlang, you can use the pre-compiled rmmseg_win32.dll directly: copy it to the priv directory.

46
src/rmmseg/rmmseg.erl Normal file
View File

@ -0,0 +1,46 @@
%%
%% rmmseg.erl
%% Kevin Lynx
%%
-module(rmmseg).
%% Intended public API of the module. Note that -compile(export_all) further
%% down additionally exports every other function defined here.
-export([init/0,
load_dicts/0,
seg_space/1,
load_dicts/2,
seg/1]).
%% NOTE(review): the OTP attribute that runs a function at module load time is
%% spelled -on_load; as written, -onload looks like a plain user-defined
%% attribute, so init/0 is presumably NOT run automatically and callers must
%% invoke rmmseg:init/0 themselves - confirm before changing the spelling.
-onload(init/0).
-compile(export_all).
%% Loads the native implementation for this module from the path that
%% in_priv_path("rmmseg") yields. Crashes with badmatch unless
%% erlang:load_nif/2 returns ok.
init() ->
    NifPath = in_priv_path("rmmseg"),
    ok = erlang:load_nif(NifPath, 0).
%% Erlang-side placeholder for the two-argument dictionary loader: ignores
%% both paths and returns the atom not_loaded.
%% NOTE(review): presumably replaced by the native implementation once
%% init/0 has loaded the NIF library - confirm against the C source.
load_dicts(_CharsPath, _WordsPath) ->
    not_loaded.
%% Erlang-side placeholder for the segmenter: ignores its argument and
%% returns the atom not_loaded.
%% NOTE(review): presumably replaced by the native implementation once
%% init/0 has loaded the NIF library - confirm against the C source.
seg(_Text) ->
    not_loaded.
%% Loads the bundled dictionaries: resolves "chars.dic" and "words.dic"
%% through in_priv_path/1 and passes both paths to load_dicts/2, whose
%% result is returned.
load_dicts() ->
    load_dicts(in_priv_path("chars.dic"), in_priv_path("words.dic")).
%% Segments the binary BStr with rmmseg:seg/1 and joins the resulting
%% elements into one binary, separated by single spaces.
%%
%% While the accumulated result is still the empty binary, the next element
%% simply becomes the result (no separator is emitted), so an empty list of
%% segments yields <<>>.
seg_space(BStr) when is_binary(BStr) ->
    Join = fun(Token, <<>>) ->
                   Token;
              (Token, Joined) ->
                   <<Joined/binary, " ", Token/binary>>
           end,
    lists:foldl(Join, <<>>, rmmseg:seg(BStr)).
%% Returns the path of Name inside the "priv" directory that sits next to
%% the directory holding this module's object code, i.e.
%% <dir of code:which(?MODULE)>/../priv/Name.
in_priv_path(Name) ->
    BeamDir = filename:dirname(code:which(?MODULE)),
    filename:join([BeamDir, "..", "priv", Name]).
%%
%% Returns the atom not_loaded.
%% NOTE(review): looks like another placeholder meant to be overridden by the
%% native library; nothing in this file calls it - confirm whether it is used.
sample() ->
not_loaded.

BIN
src/rmmseg/rmmseg_win32.dll Normal file

Binary file not shown.