mirror of
https://github.com/btdig/dhtcrawler2.git
synced 2025-02-23 21:59:04 +00:00
fix sphinx xml utf8 related issue, filter these unicode control characters, only backup delta file if the operation failed
This commit is contained in:
parent
1d870e2e42
commit
79291ab4e9
@ -3,8 +3,33 @@
|
|||||||
%% Kevin Lynx
|
%% Kevin Lynx
|
||||||
%%
|
%%
|
||||||
-module(string_util).
|
-module(string_util).
|
||||||
-export([format/2]).
|
-compile(export_all).
|
||||||
|
-export([format/2, strip_invalid_unicode/1]).
|
||||||
|
|
||||||
format(Fmt, Arg) when is_list(Fmt), is_list(Arg) ->
|
format(Fmt, Arg) when is_list(Fmt), is_list(Arg) ->
|
||||||
lists:flatten(io_lib:format(Fmt, Arg)).
|
lists:flatten(io_lib:format(Fmt, Arg)).
|
||||||
|
|
||||||
|
% strip these unicode control characters
|
||||||
|
%% Remove every character rejected by is_valid_unicode/1, and every byte
%% that is not part of a well-formed UTF-8 sequence, from the input.
%%
%% Accepts either a binary or a list. A list is converted with
%% list_to_binary/1 (so it must be a byte list / iolist) and the result is
%% returned as a byte list again; a binary yields a binary.
strip_invalid_unicode(L) when is_list(L) ->
    binary_to_list(strip_invalid_unicode(list_to_binary(L)));
strip_invalid_unicode(Bin) when is_binary(Bin) ->
    strip_invalid_unicode(Bin, <<>>).

%% Tail-recursive worker. Appending to the accumulator keeps the work
%% linear in the input size, whereas prepending to the already-filtered
%% tail copies that tail on every step.
strip_invalid_unicode(<<>>, Acc) ->
    Acc;
strip_invalid_unicode(<<C/utf8, Rest/binary>>, Acc) ->
    case is_valid_unicode(C) of
        true ->
            strip_invalid_unicode(Rest, <<Acc/binary, C/utf8>>);
        false ->
            strip_invalid_unicode(Rest, Acc)
    end;
strip_invalid_unicode(<<_, Rest/binary>>, Acc) ->
    %% The leading byte does not start a decodable UTF-8 sequence: drop
    %% just that byte and resynchronise on the next one.
    strip_invalid_unicode(Rest, Acc).
|
||||||
|
|
||||||
|
%% Decide whether the code point C should be kept.
%% C is a decoded code point (an integer), not a raw byte.
%%
%% Rejected:
%%   * C0 control characters, U+0000..U+001F
%%   * DEL and the C1 control characters, U+007F..U+009F
%%   * the non-characters U+FFFE and U+FFFF, which the XML 1.0 Char
%%     production does not allow
%% Everything else is kept, including the printable Latin-1 range
%% U+00A0..U+00FF.
is_valid_unicode(C) when C < 16#20 ->
    false;
is_valid_unicode(C) when C >= 16#7f, C =< 16#9f ->
    false;
is_valid_unicode(C) when C =:= 16#fffe; C =:= 16#ffff ->
    false;
is_valid_unicode(_) ->
    true.
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,10 +18,9 @@ search(Conn, Key, Offset, Count) ->
|
|||||||
{T2, TDocs} = case catch giza_request:send(Q4) of
|
{T2, TDocs} = case catch giza_request:send(Q4) of
|
||||||
{'EXIT', R} ->
|
{'EXIT', R} ->
|
||||||
?W(?FMT("sphinx search error ~p", [R])),
|
?W(?FMT("sphinx search error ~p", [R])),
|
||||||
[];
|
{now(), []};
|
||||||
{ok, Ret} ->
|
{ok, Ret} ->
|
||||||
T = now(),
|
{now(), decode_search_ret(Conn, Ret)}
|
||||||
{T, decode_search_ret(Conn, Ret)}
|
|
||||||
end,
|
end,
|
||||||
T3 = now(),
|
T3 = now(),
|
||||||
Stats = {timer:now_diff(T2, T1), timer:now_diff(T3, T2)},
|
Stats = {timer:now_diff(T2, T1), timer:now_diff(T3, T2)},
|
||||||
|
@ -27,7 +27,8 @@ do_build_init_index(MainFile, DeltaFile, CfgFile) ->
|
|||||||
%% Run the external `indexer' command with `--rotate' for the Delta index
%% using the configuration file CfgFile, then log the command together
%% with its output.
%%
%% Whether the run succeeded is decided by check_cmd_success/1 on the
%% command output; that flag is handed to backup_delta_file/5, which keeps
%% a copy of IndexFile only when the run did not succeed.
build_delta_index(IndexFile, Delta, CfgFile, MinID, MaxID) ->
    Cmd = "indexer -c " ++ CfgFile ++ " --rotate " ++ Delta,
    Res = os:cmd(Cmd),
    Success = check_cmd_success(Res),
    Dest = backup_delta_file(Delta, MinID, MaxID, IndexFile, Success),
    %% The command output is passed as an argument, never spliced into the
    %% format string: a `~' in the output would otherwise be read as a
    %% format directive and make the formatting call fail.
    ?I(?FMT("command `~s' result on ~s~n~ts", [Cmd, Dest, Res])).
|
||||||
|
|
||||||
merge_index(Main, Delta, CfgFile) ->
|
merge_index(Main, Delta, CfgFile) ->
|
||||||
@ -36,9 +37,13 @@ merge_index(Main, Delta, CfgFile) ->
|
|||||||
Res = os:cmd(Cmd),
|
Res = os:cmd(Cmd),
|
||||||
?I(?FMT("command `~s' result~n" ++ Res, [Cmd])).
|
?I(?FMT("command `~s' result~n" ++ Res, [Cmd])).
|
||||||
|
|
||||||
%% Compute the backup file name for a delta index build and, when the
%% build did not succeed, copy IndexFile to it.
%%
%% The destination lives in the directory of IndexFile and is named
%% "<Delta>[<MinID>-<MaxID>].xml". Flag is the success flag of the build:
%% the copy is made only when Flag is `false'.
%% Returns the destination path whether or not a copy was made.
backup_delta_file(Delta, MinID, MaxID, IndexFile, Flag) ->
    Path = filename:dirname(IndexFile),
    %% Only the numeric range goes through the formatter; Path and Delta
    %% are appended verbatim so a `~' in either cannot be misread as a
    %% format directive.
    Range = string_util:format("[~b-~b]", [MinID, MaxID]),
    Dest = Path ++ "/" ++ Delta ++ Range ++ ".xml",
    if
        not Flag ->
            case file:copy(IndexFile, Dest) of
                {ok, _BytesCopied} ->
                    ok;
                {error, Reason} ->
                    %% Report a failed backup instead of dropping it silently.
                    ?I(?FMT("backup ~s to ~s failed: ~p", [IndexFile, Dest, Reason]))
            end;
        true ->
            skip
    end,
    Dest.
|
||||||
|
|
||||||
|
% too simple
|
||||||
|
%% Heuristically decide from the textual output Res of the indexer command
%% whether it succeeded. Returns `true' when a success marker occurs
%% anywhere in Res, `false' otherwise.
%%
%% NOTE(review): the misspelt marker "succesfully" presumably mirrors the
%% wording printed by the indexer tool itself; confirm against the
%% installed version. The correct spelling is accepted as well so that
%% output using it is not mistaken for a failure.
check_cmd_success(Res) ->
    Markers = ["succesfully", "successfully"],
    lists:any(fun(Marker) -> string:str(Res, Marker) > 0 end, Markers).
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
%%
|
%%
|
||||||
-module(sphinx_xml).
|
-module(sphinx_xml).
|
||||||
-behaviour(gen_server).
|
-behaviour(gen_server).
|
||||||
|
-compile(export_all).
|
||||||
-include("vlog.hrl").
|
-include("vlog.hrl").
|
||||||
-export([init/1,
|
-export([init/1,
|
||||||
handle_call/3,
|
handle_call/3,
|
||||||
@ -48,9 +49,9 @@ handle_cast(save, #state{docs = Docs, ids = IDs} = State) when length(Docs) > 0
|
|||||||
handle_cast(stop, State) ->
|
handle_cast(stop, State) ->
|
||||||
{stop, normal, State}.
|
{stop, normal, State}.
|
||||||
|
|
||||||
handle_call({insert, {ID, Hash, Name, Files, Query, CreatedAt}}, _From, State) ->
|
handle_call({insert, DocT}, _From, State) ->
|
||||||
#state{docs = Docs, ids = IDs, max = Max} = State,
|
#state{docs = Docs, ids = IDs, max = Max} = State,
|
||||||
Doc = sphinx_doc:element(Hash, Name, Files, ID, Query, CreatedAt),
|
{ID, Doc} = create_doc(DocT),
|
||||||
{NewDocs, NewIDs} = try_save([Doc|Docs], Max, [ID|IDs]),
|
{NewDocs, NewIDs} = try_save([Doc|Docs], Max, [ID|IDs]),
|
||||||
{reply, ok, State#state{docs = NewDocs, ids = NewIDs}};
|
{reply, ok, State#state{docs = NewDocs, ids = NewIDs}};
|
||||||
|
|
||||||
@ -83,3 +84,20 @@ get_id_range([First|IDs]) ->
|
|||||||
lists:foldl(fun(ID, {Min, Max}) ->
|
lists:foldl(fun(ID, {Min, Max}) ->
|
||||||
{min(ID, Min), max(ID, Max)}
|
{min(ID, Min), max(ID, Max)}
|
||||||
end, {First, First}, IDs).
|
end, {First, First}, IDs).
|
||||||
|
|
||||||
|
%% Build the sphinx document for one record.
%% The record is the tuple {ID, Hash, Name, Files, Query, CreatedAt}.
%% Name and the names inside Files are passed through valid_name/1 /
%% valid_file_names/1 before the document element is created.
%% Returns {ID, Doc}.
create_doc({ID, Hash, Name, Files, Query, CreatedAt}) ->
    CleanName = valid_name(Name),
    CleanFiles = valid_file_names(Files),
    {ID, sphinx_doc:element(Hash, CleanName, CleanFiles, ID, Query, CreatedAt)}.
|
||||||
|
|
||||||
|
%% Apply valid_name/1 to the name of every {Name, Length} entry in Files,
%% keeping Length untouched and preserving the order of the entries.
%% Elements that are not 2-tuples are left out of the result.
valid_file_names(Files) ->
    Collect = fun({FileName, Size}, Acc) ->
                      [{valid_name(FileName), Size} | Acc];
                 (_Other, Acc) ->
                      Acc
              end,
    lists:reverse(lists:foldl(Collect, [], Files)).
|
||||||
|
|
||||||
|
%% Return S with everything removed that
%% string_util:strip_invalid_unicode/1 rejects, and log the before/after
%% pair whenever something was actually removed.
%%
%% The sizes are compared with iolist_size/1 rather than length/1 so the
%% check also works when S is a binary; length/1 inside an `if' guard
%% fails silently on a binary and the removal would never be logged.
valid_name(S) ->
    ValidName = string_util:strip_invalid_unicode(S),
    case iolist_size(ValidName) < iolist_size(S) of
        true ->
            ?I(?FMT("~s -> ~s", [S, ValidName]));
        false ->
            ok
    end,
    ValidName.
|
||||||
|
Loading…
Reference in New Issue
Block a user