mirror of
https://github.com/btdig/dhtcrawler2.git
synced 2025-02-23 21:59:04 +00:00
Fix Sphinx XML UTF-8 related issue: filter out Unicode control characters, and only back up the delta file if the operation failed
This commit is contained in:
parent
1d870e2e42
commit
79291ab4e9
@ -3,8 +3,33 @@
|
||||
%% Kevin Lynx
|
||||
%%
|
||||
-module(string_util).
|
||||
-export([format/2]).
|
||||
-compile(export_all).
|
||||
-export([format/2, strip_invalid_unicode/1]).
|
||||
|
||||
%% Render Arg through the io_lib format string Fmt and return the result
%% as one flat character list instead of the nested chardata that
%% io_lib:format/2 produces. Both arguments must be lists; anything else
%% fails with function_clause.
format(Fmt, Arg) when is_list(Fmt) andalso is_list(Arg) ->
    Chars = io_lib:format(Fmt, Arg),
    lists:flatten(Chars).
|
||||
|
||||
%% Remove every code point rejected by is_valid_unicode/1 from a UTF-8
%% encoded text, together with any byte that does not start a well-formed
%% UTF-8 sequence.
%%
%% Accepts either a binary or a list of bytes (the list is converted with
%% list_to_binary/1, so it must not contain integers above 255) and
%% returns the same kind of term it was given.
strip_invalid_unicode(L) when is_list(L) ->
    binary_to_list(strip_invalid_unicode(list_to_binary(L)));
strip_invalid_unicode(Bin) when is_binary(Bin) ->
    strip_invalid_unicode(Bin, <<>>).

%% Tail-recursive worker. Kept code points are appended to Acc so the
%% binary grows at its end rather than copying the whole tail each step.
strip_invalid_unicode(<<>>, Acc) ->
    Acc;
strip_invalid_unicode(<<C/utf8, Rest/binary>>, Acc) ->
    case is_valid_unicode(C) of
        true ->
            strip_invalid_unicode(Rest, <<Acc/binary, C/utf8>>);
        false ->
            strip_invalid_unicode(Rest, Acc)
    end;
%% The head is not a decodable UTF-8 sequence: drop one byte and resync.
strip_invalid_unicode(<<_, Rest/binary>>, Acc) ->
    strip_invalid_unicode(Rest, Acc).
|
||||
|
||||
%% Tell whether code point C should be kept in text that ends up in the
%% generated XML.
%%
%% Rejected:
%%   * C0 control characters, U+0000..U+001F
%%   * DEL and the C1 control characters, U+007F..U+009F
%%   * the noncharacters U+FFFE and U+FFFF, which XML 1.0 does not allow
%% Everything else is accepted, including the printable Latin-1 range
%% U+00A0..U+00FF (accented letters and similar), which are not control
%% characters.
is_valid_unicode(C) when C < 16#20 ->
    false;
is_valid_unicode(C) when C >= 16#7f, C =< 16#9f ->
    false;
is_valid_unicode(C) when C =:= 16#fffe; C =:= 16#ffff ->
    false;
is_valid_unicode(_) ->
    true.
|
||||
|
||||
|
||||
|
@ -18,10 +18,9 @@ search(Conn, Key, Offset, Count) ->
|
||||
{T2, TDocs} = case catch giza_request:send(Q4) of
|
||||
{'EXIT', R} ->
|
||||
?W(?FMT("sphinx search error ~p", [R])),
|
||||
[];
|
||||
{now(), []};
|
||||
{ok, Ret} ->
|
||||
T = now(),
|
||||
{T, decode_search_ret(Conn, Ret)}
|
||||
{now(), decode_search_ret(Conn, Ret)}
|
||||
end,
|
||||
T3 = now(),
|
||||
Stats = {timer:now_diff(T2, T1), timer:now_diff(T3, T2)},
|
||||
|
@ -27,7 +27,8 @@ do_build_init_index(MainFile, DeltaFile, CfgFile) ->
|
||||
%% Run the Sphinx `indexer` for the delta index named Delta using the
%% configuration file CfgFile (with --rotate), then log the command, the
%% backup destination and the command output.
%%
%% Whether the run succeeded is decided by check_cmd_success/1 on the
%% command output; that flag is handed to backup_delta_file/5, which
%% decides whether IndexFile is actually copied.
build_delta_index(IndexFile, Delta, CfgFile, MinID, MaxID) ->
    Cmd = "indexer -c " ++ CfgFile ++ " --rotate " ++ Delta,
    Res = os:cmd(Cmd),
    Success = check_cmd_success(Res),
    Dest = backup_delta_file(Delta, MinID, MaxID, IndexFile, Success),
    %% Res is external program output, so it is passed as an argument and
    %% never spliced into the format string: a stray `~' in it would
    %% otherwise be read as a control sequence and make formatting fail.
    %% ~ts is used because the output may hold characters above 255.
    ?I(?FMT("command `~s' result on ~s~n~ts", [Cmd, Dest, Res])).
|
||||
|
||||
merge_index(Main, Delta, CfgFile) ->
|
||||
@ -36,9 +37,13 @@ merge_index(Main, Delta, CfgFile) ->
|
||||
Res = os:cmd(Cmd),
|
||||
?I(?FMT("command `~s' result~n" ++ Res, [Cmd])).
|
||||
|
||||
%% Work out where a backup of IndexFile belongs and, when Flag is false
%% (the indexing run did not succeed), copy the file there.
%%
%% The destination lives in the same directory as IndexFile and is named
%% "<Delta>[<MinID>-<MaxID>].xml". The destination path is always
%% returned, whether or not a copy was made.
backup_delta_file(Delta, MinID, MaxID, IndexFile, Flag) ->
    Path = filename:dirname(IndexFile),
    %% Only the constant suffix goes through the formatter. Path and Delta
    %% are appended afterwards so a `~' in either cannot be mistaken for a
    %% format control sequence.
    Range = string_util:format("[~b-~b].xml", [MinID, MaxID]),
    Dest = Path ++ "/" ++ Delta ++ Range,
    case Flag of
        false ->
            case file:copy(IndexFile, Dest) of
                {ok, _Bytes} ->
                    ok;
                {error, Reason} ->
                    %% The backup stays best effort, but a failure is now
                    %% recorded instead of being dropped silently.
                    ?I(?FMT("backup ~ts -> ~ts failed: ~p",
                            [IndexFile, Dest, Reason]))
            end;
        _ ->
            %% Success (or any non-false flag): nothing to back up.
            ok
    end,
    Dest.
|
||||
|
||||
%% Crude success test for an external command: true when the captured
%% output Res contains the marker text, false otherwise.
%% NOTE(review): the marker's unusual spelling is presumably deliberate,
%% matching the text the indexer prints. Confirm against real output
%% before "correcting" it.
check_cmd_success(Res) ->
    case string:str(Res, "succesfully") of
        0 -> false;
        _ -> true
    end.
|
||||
|
@ -5,6 +5,7 @@
|
||||
%%
|
||||
-module(sphinx_xml).
|
||||
-behaviour(gen_server).
|
||||
-compile(export_all).
|
||||
-include("vlog.hrl").
|
||||
-export([init/1,
|
||||
handle_call/3,
|
||||
@ -48,9 +49,9 @@ handle_cast(save, #state{docs = Docs, ids = IDs} = State) when length(Docs) > 0
|
||||
handle_cast(stop, State) ->
|
||||
{stop, normal, State}.
|
||||
|
||||
handle_call({insert, {ID, Hash, Name, Files, Query, CreatedAt}}, _From, State) ->
|
||||
handle_call({insert, DocT}, _From, State) ->
|
||||
#state{docs = Docs, ids = IDs, max = Max} = State,
|
||||
Doc = sphinx_doc:element(Hash, Name, Files, ID, Query, CreatedAt),
|
||||
{ID, Doc} = create_doc(DocT),
|
||||
{NewDocs, NewIDs} = try_save([Doc|Docs], Max, [ID|IDs]),
|
||||
{reply, ok, State#state{docs = NewDocs, ids = NewIDs}};
|
||||
|
||||
@ -83,3 +84,20 @@ get_id_range([First|IDs]) ->
|
||||
lists:foldl(fun(ID, {Min, Max}) ->
|
||||
{min(ID, Min), max(ID, Max)}
|
||||
end, {First, First}, IDs).
|
||||
|
||||
%% Build the Sphinx document for one record tuple.
%% The torrent name and every file name are first passed through
%% valid_name/1; the cleaned values are then given to
%% sphinx_doc:element/6. Returns {ID, Doc} so the caller can track the id
%% alongside the document.
create_doc({ID, Hash, Name, Files, Query, CreatedAt}) ->
    CleanName = valid_name(Name),
    CleanFiles = valid_file_names(Files),
    {ID, sphinx_doc:element(Hash, CleanName, CleanFiles, ID, Query, CreatedAt)}.
|
||||
|
||||
%% Apply valid_name/1 to the name of every {Name, Length} pair in Files,
%% leaving each length untouched. Elements that are not 2-tuples are
%% dropped by the generator pattern rather than raising an error.
valid_file_names(Files) ->
    [{valid_name(FileName), Size} || {FileName, Size} <- Files].
|
||||
|
||||
%% Return S with the characters rejected by
%% string_util:strip_invalid_unicode/1 removed. When the cleaned list is
%% shorter than the input, the before/after pair is logged.
valid_name(S) ->
    Cleaned = string_util:strip_invalid_unicode(S),
    case Cleaned of
        %% The length comparison stays inside a guard on purpose: a guard
        %% that fails to evaluate (for example on non-list input) simply
        %% does not match, instead of raising.
        _ when length(Cleaned) < length(S) ->
            ?I(?FMT("~s -> ~s", [S, Cleaned]));
        _ ->
            ok
    end,
    Cleaned.
|
||||
|
Loading…
Reference in New Issue
Block a user