mirror of https://github.com/btdig/dhtcrawler2.git
synced 2025-02-07 05:49:06 +00:00

Fix some sphinx-related bugs; the crawler can now be used to build the sphinx index (still experimental). Add the giza library to query sphinx from the http frontend.

This commit is contained in:
parent 7ab79b5d2e
commit 7b1a435a43
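For orientation, querying the sphinx searchd daemon through giza from the http frontend would look roughly like the sketch below. This is an illustration only: the wrapper name search_torrents/1, the index name, host and port are assumptions (they must match the csft.conf used by searchd), and the giza calls (giza_query:new/2, giza_query:host/2, giza_query:port/2, giza_request:send/1) follow giza's documented API rather than code changed in this commit.

    %% Hypothetical sketch, not part of this commit.
    search_torrents(Keyword) ->
        Q0 = giza_query:new("main", Keyword),   % giza_query:new(IndexName, QueryString)
        Q1 = giza_query:host(Q0, "localhost"),
        Q2 = giza_query:port(Q1, 9312),         % searchd listen port, from csft.conf
        {ok, Results} = giza_request:send(Q2),
        %% each result carries the numeric document id that sphinx_builder
        %% wrote into the xmlpipe2 source files
        Results.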
rebar.config
@@ -4,6 +4,7 @@
     {ibrowse, ".*", {git, "git@github.com:cmullaparthi/ibrowse.git", "HEAD"}},
     {bson, ".*", {git, "git@github.com:mongodb/bson-erlang.git", "HEAD"}},
     {mongodb, ".*", {git, "git@github.com:mongodb/mongodb-erlang.git", "HEAD"}},
-    {kdht, ".*", {git, "git@github.com:kevinlynx/kdht.git", "HEAD"}}
+    {kdht, ".*", {git, "git@github.com:kevinlynx/kdht.git", "HEAD"}},
+    {giza, ".*", {git, "https://github.com/kevinlynx/giza.git", "HEAD"}}
 ]}.

crawler_http.erl
@@ -32,6 +32,7 @@ start(DBHost, DBPort, Port, PoolSize) ->
     vlog:start_link("log/crawler_http.log", ?INFO),
     code:add_path("deps/bson/ebin"),
     code:add_path("deps/mongodb/ebin"),
+    code:add_path("deps/giza/ebin"),
     Apps = [crypto, public_key, ssl, inets, bson, mongodb],
     [application:start(App) || App <- Apps],
     gen_server:start({local, srv_name()}, ?MODULE, [DBHost, DBPort, Port, PoolSize], []).
sphinx_builder.erl
@@ -14,7 +14,7 @@
     code_change/3]).
 -export([start_link/3]).
 -export([worker_run/0]).
--record(state, {processed = 0, worker_cnt, wait_workers = []}).
+-record(state, {processed = 0, saved = false, worker_cnt, wait_workers = []}).
 -define(WORKER_WAIT, 30*1000).
 -define(STATE_FILE, "priv/sphinx_builder.sta").

@@ -32,7 +32,8 @@ init([IP, Port, WorkerCnt]) ->
     {ok, #state{processed = Offset, worker_cnt = WorkerCnt}}.

 handle_call({get, Pid}, _From, State) ->
-    #state{processed = Processed, worker_cnt = WorkerCnt, wait_workers = WaitWorkers} = State,
+    #state{processed = Processed, worker_cnt = WorkerCnt, wait_workers = WaitWorkers,
+        saved = Saved} = State,
     {NewProcessed, Ret} = case sphinx_torrent:get() of
         {} ->
             {Processed, wait};
@@ -41,8 +42,9 @@ handle_call({get, Pid}, _From, State) ->
             {Processed + 1, Tor}
     end,
     NewWaits = update_wait_workers(Pid, NewProcessed, Processed, WaitWorkers),
-    check_all_done(NewWaits, WorkerCnt, NewProcessed, length(NewWaits) > length(WaitWorkers)),
-    {reply, {NewProcessed, Ret}, State#state{processed = NewProcessed, wait_workers = NewWaits}}.
+    NewSaved = (check_all_done(NewWaits, WorkerCnt, NewProcessed, Saved)) and (Ret == wait),
+    {reply, {NewProcessed, Ret}, State#state{processed = NewProcessed, wait_workers = NewWaits,
+        saved = NewSaved}}.

 handle_cast(_, State) ->
     {noreply, State}.
@@ -66,19 +68,20 @@ update_wait_workers(Pid, NewProcessed, Processed, WaitWorkers) ->
             WaitWorkers
     end.

-check_all_done(WaitWorkers, WorkerCnt, Processed, true)
+check_all_done(WaitWorkers, WorkerCnt, Processed, false)
     when length(WaitWorkers) == WorkerCnt ->
     Try = sphinx_torrent:try_times(),
     case Try > 5 of
         true ->
             io:format("haven't got any torrents for a while, force save~n", []),
             save_result(Processed),
-            sphinx_xml:force_save();
+            sphinx_xml:force_save(),
+            true;
         false ->
-            ok
+            false
     end;
-check_all_done(_WaitWorkers, _WaitCnt, _Processed, _) ->
-    ok.
+check_all_done(_WaitWorkers, _WaitCnt, _Processed, Saved) ->
+    Saved.

 worker_run() ->
     Ret = gen_server:call(srv_name(), {get, self()}),
@@ -91,9 +94,9 @@ do_process({_, wait}) ->
 do_process({ID, Doc}) ->
     case db_store_mongo:decode_torrent_item(Doc) of
         {single, Hash, {Name, _}, Query, CreatedAt} ->
-            sphinx_xml:insert({Hash, Name, [], ID, Query, CreatedAt});
+            sphinx_xml:insert({ID, Hash, Name, [], Query, CreatedAt});
         {multi, Hash, {Name, Files}, Query, CreatedAt} ->
-            sphinx_xml:insert({Hash, Name, Files, ID, Query, CreatedAt})
+            sphinx_xml:insert({ID, Hash, Name, Files, Query, CreatedAt})
     end.

 load_result() ->
sphinx_builder_sup.erl
@@ -6,6 +6,7 @@
 -module(sphinx_builder_sup).
 -behaviour(supervisor).
 -export([start_standalone/1, start_standalone/3, start_link/3]).
+-export([init_indexes/0]).
 -export([init/1]).

 start_dep_apps() ->
@@ -29,6 +30,13 @@ start_standalone(IP, Port, Size) ->
 start_link(IP, Port, Count) ->
     supervisor:start_link({local, srv_name()}, ?MODULE, [IP, Port, Count]).

+init_indexes() ->
+    config:start_link("sphinx_builder.config", fun() -> config_default() end),
+    io:format("try init sphinx index files~n", []),
+    Conf = config:get(sphinx_config_file),
+    MainFile = config:get(main_source_file),
+    DeltaFile = config:get(delta_source_file),
+    sphinx_cmd:build_init_index(MainFile, DeltaFile, Conf).
 %%
 srv_name() ->
     ?MODULE.
@@ -38,14 +46,14 @@ init([IP, Port, Count]) ->
     config:start_link("sphinx_builder.config", fun() -> config_default() end),
     Builder = {sphinx_builder, {sphinx_builder, start_link, [IP, Port, Count]}, permanent, 1000, worker, [sphinx_builder]},
     Indexer = {sphinx_xml, {sphinx_xml, start_link, []}, permanent, 1000, worker, [sphinx_xml]},
-    Logger = {vlog, {vlog, start_link, ["log/sphinx_build.log", 0]}, permanent, 1000, worker, [vlog]},
+    Logger = {vlog, {vlog, start_link, ["log/sphinx_build.log", 1]}, permanent, 1000, worker, [vlog]},
     Children = [Logger, Builder, Indexer],
     {ok, {Spec, Children}}.

 config_default() ->
-    [{load_torrent_interval, 500}, % millseconds
-    {max_doc_per_file, 1000},
+    [{max_doc_per_file, 1000},
     {torrent_batch_count, 100},
+    {main_source_file, "var/source/main.xml"},
     {delta_source_file, "var/source/delta.xml"},
     {sphinx_config_file, "var/etc/csft.conf"},
     {delta_index_name, "delta"},
|
@ -4,10 +4,25 @@
|
|||||||
%% 07.28.2013
|
%% 07.28.2013
|
||||||
%%
|
%%
|
||||||
-module(sphinx_cmd).
|
-module(sphinx_cmd).
|
||||||
-export([build_delta_index/5, merge_index/3]).
|
-export([build_init_index/3, build_delta_index/5, merge_index/3]).
|
||||||
-compile(export_all).
|
-compile(export_all).
|
||||||
-include("vlog.hrl").
|
-include("vlog.hrl").
|
||||||
|
|
||||||
|
build_init_index(MainFile, DeltaFile, CfgFile) ->
|
||||||
|
case filelib:is_file(MainFile) and filelib:is_file(DeltaFile) of
|
||||||
|
true ->
|
||||||
|
io:format("main/delta index file exists, ignore~n", []);
|
||||||
|
false ->
|
||||||
|
do_build_init_index(MainFile, DeltaFile, CfgFile)
|
||||||
|
end.
|
||||||
|
|
||||||
|
do_build_init_index(MainFile, DeltaFile, CfgFile) ->
|
||||||
|
sphinx_doc:write_test_xml(MainFile),
|
||||||
|
sphinx_doc:write_test_xml(DeltaFile),
|
||||||
|
Cmd = "indexer -c " ++ CfgFile ++ " --all",
|
||||||
|
Ret = os:cmd(Cmd),
|
||||||
|
io:format("~p~n", [Ret]).
|
||||||
|
|
||||||
% Index file, Delta index name
|
% Index file, Delta index name
|
||||||
build_delta_index(IndexFile, Delta, CfgFile, MinID, MaxID) ->
|
build_delta_index(IndexFile, Delta, CfgFile, MinID, MaxID) ->
|
||||||
Cmd = "indexer -c " ++ CfgFile ++ " --rotate " ++ Delta,
|
Cmd = "indexer -c " ++ CfgFile ++ " --rotate " ++ Delta,
|
||||||
|
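The new build_init_index/3 seeds both xmlpipe2 source files with a single placeholder document (via sphinx_doc:write_test_xml/1) and then runs the Sphinx/coreseek indexer over all indexes. A minimal usage sketch, assuming the default paths from config_default/0 in sphinx_builder_sup:

    %% Sketch only; the paths are the defaults from sphinx_builder_sup:config_default/0.
    sphinx_cmd:build_init_index("var/source/main.xml",
        "var/source/delta.xml",
        "var/etc/csft.conf").
    %% Internally this shells out to: indexer -c var/etc/csft.conf --all
    %% and is skipped entirely when both source files already exist.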
sphinx_doc.erl
@@ -4,12 +4,16 @@
 %%
 -module(sphinx_doc).
 -include_lib("xmerl/include/xmerl.hrl").
--export([write_xml/2, element/6]).
+-export([write_test_xml/1, write_xml/2, element/6]).
 -compile(export_all).
 -define(PROLOG, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>").
 -define(CR, #xmlText{value="\
 "}).

+write_test_xml(File) ->
+    Elem = element("33FB6D00DD5E363653235449527EC1DC9959FCAB", "test", [], 1, 1, 1374508800),
+    write_xml(File, [Elem]).
+
 write_xml(File, Elems) ->
     Doc = {'sphinx:docset', [], [schema(), ?CR] ++ Elems},
     Content = xmerl:export_simple([Doc], xmerl_xml_cdata, [{prolog, ?PROLOG}]),
sphinx_xml.erl
@@ -13,13 +13,13 @@
     terminate/2,
     code_change/3]).
 -export([start_link/0, insert/1, force_save/0]).
--record(state, {docs = [], max, startid = -1}).
+-record(state, {docs = [], ids = [], max}).

 start_link() ->
     gen_server:start_link({local, srv_name()}, ?MODULE, [], []).

 insert(Doc) ->
-    gen_server:cast(srv_name(), {insert, Doc}).
+    gen_server:call(srv_name(), {insert, Doc}, infinity).

 force_save() ->
     gen_server:cast(srv_name(), save).
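Note on the insert/1 change above: buffering a document is now a synchronous gen_server:call with an infinity timeout, presumably so that a sphinx_builder worker blocks while any triggered index sync (write_xml, delta build, merge) runs, instead of piling casts into the mailbox. A caller-side sketch (placeholder variables, not code from this commit):

    %% insert/1 now returns ok only after the document has been buffered
    %% (and any index sync it triggered has completed).
    ok = sphinx_xml:insert({ID, Hash, Name, Files, Query, CreatedAt}).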
@@ -38,42 +38,48 @@ terminate(_, State) ->
 code_change(_, _, State) ->
     {ok, State}.

-handle_cast(save, #state{docs = Docs} = State) when length(Docs) > 0 ->
-    #state{startid = StartID} = State,
-    EndID = length(Docs) + StartID - 1,
-    try_save(Docs, 0, StartID, EndID),
-    {noreply, State#state{docs = []}};
+handle_cast(save, #state{docs = []} = State) ->
+    {noreply, State};

-handle_cast({insert, {Hash, Name, Files, ID, Query, CreatedAt}}, State) ->
-    #state{docs = Docs, max = Max, startid = StartID} = State,
-    NewStartID = if length(Docs) == 0 -> ID; true -> StartID end,
-    Doc = sphinx_doc:element(Hash, Name, Files, ID, Query, CreatedAt),
-    NewDocs = try_save([Doc|Docs], Max, NewStartID, ID),
-    {noreply, State#state{docs = NewDocs, startid = NewStartID}};
+handle_cast(save, #state{docs = Docs, ids = IDs} = State) when length(Docs) > 0 ->
+    try_save(Docs, 0, IDs),
+    {noreply, State#state{docs = [], ids = []}};

 handle_cast(stop, State) ->
     {stop, normal, State}.

+handle_call({insert, {ID, Hash, Name, Files, Query, CreatedAt}}, _From, State) ->
+    #state{docs = Docs, ids = IDs, max = Max} = State,
+    Doc = sphinx_doc:element(Hash, Name, Files, ID, Query, CreatedAt),
+    {NewDocs, NewIDs} = try_save([Doc|Docs], Max, [ID|IDs]),
+    {reply, ok, State#state{docs = NewDocs, ids = NewIDs}};
+
 handle_call(_, _From, State) ->
     {noreply, State}.

 handle_info(_, State) ->
     {noreply, State}.

-try_save(Docs, Max, StartID, NowID) when length(Docs) >= Max, length(Docs) > 0 ->
+try_save(Docs, Max, IDs) when length(Docs) >= Max, length(Docs) > 0 ->
     File = config:get(delta_source_file),
     Conf = config:get(sphinx_config_file),
     Delta = config:get(delta_index_name),
     Main = config:get(main_index_name),
+    {StartID, EndID} = get_id_range(IDs),
     io:format("sync sphinx index ~p documents...", [length(Docs)]),
     ?I(?FMT("save sphinx xml file ~s", [File])),
     sphinx_doc:write_xml(File, Docs),
     ?I(?FMT("build delta index : ~s", [Delta])),
-    sphinx_cmd:build_delta_index(File, Delta, Conf, StartID, NowID),
+    sphinx_cmd:build_delta_index(File, Delta, Conf, StartID, EndID),
     ?I(?FMT("merge delta to main index ~s -> ~s", [Delta, Main])),
     sphinx_cmd:merge_index(Main, Delta, Conf),
     ?I("index updated done"),
     io:format("done~n", []),
-    [];
-try_save(Docs, _, _, _) ->
-    Docs.
+    {[], []};
+try_save(Docs, _, IDs) ->
+    {Docs, IDs}.
+
+get_id_range([First|IDs]) ->
+    lists:foldl(fun(ID, {Min, Max}) ->
+        {min(ID, Min), max(ID, Max)}
+    end, {First, First}, IDs).
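Since documents no longer arrive with contiguous, ordered IDs, the delta-index range is now derived from the collected ID list. A worked example (not part of the patch):

    %% get_id_range/1 folds the buffered IDs down to the {StartID, EndID}
    %% pair that try_save/3 hands to sphinx_cmd:build_delta_index/5.
    {3, 12} = get_id_range([12, 3, 7]).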
@@ -1,3 +1,7 @@
+## 07.30.2013
+
+* add sphinx (coreseek, which is based on sphinx) to help searching, still in experiment stage
+
 ## 07.21.2013

 * rewrite hash_reader, now it will keep a wait_download cache
|
@ -3,10 +3,12 @@ mkdir bin\deps\bson\ebin
|
|||||||
mkdir bin\deps\mongodb\ebin
|
mkdir bin\deps\mongodb\ebin
|
||||||
mkdir bin\deps\kdht\ebin
|
mkdir bin\deps\kdht\ebin
|
||||||
mkdir bin\deps\ibrowse\ebin
|
mkdir bin\deps\ibrowse\ebin
|
||||||
|
mkdir bin\deps\giza\ebin
|
||||||
copy deps\bson\ebin\*.* bin\deps\bson\ebin\
|
copy deps\bson\ebin\*.* bin\deps\bson\ebin\
|
||||||
copy deps\mongodb\ebin\*.* bin\deps\mongodb\ebin\
|
copy deps\mongodb\ebin\*.* bin\deps\mongodb\ebin\
|
||||||
copy deps\kdht\ebin\*.* bin\deps\kdht\ebin\
|
copy deps\kdht\ebin\*.* bin\deps\kdht\ebin\
|
||||||
copy deps\ibrowse\ebin\*.* bin\deps\ibrowse\ebin\
|
copy deps\ibrowse\ebin\*.* bin\deps\ibrowse\ebin\
|
||||||
|
copy deps\giza\ebin\*.* bin\deps\giza\ebin\
|
||||||
mkdir bin\www
|
mkdir bin\www
|
||||||
copy www\*.* bin\www\
|
copy www\*.* bin\www\
|
||||||
copy tools\*.* bin\
|
copy tools\*.* bin\
|
||||||
tools/win_init_sphinx_index.bat (new file)
@@ -0,0 +1 @@
+erl -pa ebin -noshell -run sphinx_builder_sup init_indexes

tools/win_start_sphinx_builder.bat (new file)
@@ -0,0 +1 @@
+erl -pa ebin -noshell -run sphinx_builder_sup start_standalone localhost 27017 5