Fix Sphinx XML UTF-8 related issue: filter out Unicode control characters, and only back up the delta file if the operation failed

Kevin Lynx 2013-08-01 23:17:52 +08:00
4 changed files with 56 additions and 9 deletions

%% Kevin Lynx
-export([format/2, strip_invalid_unicode/1]).
%% Format `Arg` according to the control string `Fmt` (io_lib:format/2
%% syntax) and return the result as a flat character list instead of the
%% nested chardata io_lib:format/2 produces.
%% Only defined when both `Fmt` and `Arg` are lists.
format(Fmt, Arg) when is_list(Fmt), is_list(Arg) ->
    Chars = io_lib:format(Fmt, Arg),
    lists:flatten(Chars).
% strip Unicode control characters from the input
strip_invalid_unicode(L) when is_list(L) ->
strip_invalid_unicode(<<>>) ->
strip_invalid_unicode(<<C/utf8, R/binary>>) ->
case is_valid_unicode(C) of
true ->
RR = strip_invalid_unicode(R),
<<C/utf8, RR/binary>>;
false ->
strip_invalid_unicode(<<_, R/binary>>) ->
is_valid_unicode(C) when C < 16#20 ->
is_valid_unicode(C) when C >= 16#7f, C =< 16#ff ->
is_valid_unicode(_) ->

{T2, TDocs} = case catch giza_request:send(Q4) of
{'EXIT', R} ->
?W(?FMT("sphinx search error ~p", [R])),
{now(), []};
{ok, Ret} ->
T = now(),
{T, decode_search_ret(Conn, Ret)}
{now(), decode_search_ret(Conn, Ret)}
T3 = now(),
Stats = {timer:now_diff(T2, T1), timer:now_diff(T3, T2)},

build_delta_index(IndexFile, Delta, CfgFile, MinID, MaxID) ->
Cmd = "indexer -c " ++ CfgFile ++ " --rotate " ++ Delta,
Res = os:cmd(Cmd),
Dest = backup_delta_file(Delta, MinID, MaxID, IndexFile),
Success = check_cmd_success(Res),
Dest = backup_delta_file(Delta, MinID, MaxID, IndexFile, Success),
?I(?FMT("command `~s' result on ~s~n" ++ Res, [Cmd, Dest])).
merge_index(Main, Delta, CfgFile) ->
Res = os:cmd(Cmd),
?I(?FMT("command `~s' result~n" ++ Res, [Cmd])).
backup_delta_file(Delta, MinID, MaxID, IndexFile) ->
backup_delta_file(Delta, MinID, MaxID, IndexFile, Flag) ->
Path = filename:dirname(IndexFile),
Dest = string_util:format(Path ++ "/" ++ Delta ++ "[~b-~b]" ++ ".xml",
[MinID, MaxID]),
file:copy(IndexFile, Dest),
if not Flag -> file:copy(IndexFile, Dest); true -> skip end,
%% Crude success check for a shell command's captured output: returns
%% `true` when `Res` contains the substring "succesfully", `false`
%% otherwise. (The original author flags this heuristic as "too simple".)
%% NOTE(review): the misspelling in the literal looks deliberate --
%% presumably it mirrors the exact wording printed by the external tool;
%% confirm against real output before "correcting" it.
check_cmd_success(Res) ->
    case string:str(Res, "succesfully") of
        0 -> false;
        _Position -> true
    end.

%% Cast handler for `stop`: returns a stop tuple with reason `normal`,
%% handing the current state back untouched.
handle_cast(stop, CurrentState) ->
    {stop, normal, CurrentState}.
handle_call({insert, {ID, Hash, Name, Files, Query, CreatedAt}}, _From, State) ->
handle_call({insert, DocT}, _From, State) ->
#state{docs = Docs, ids = IDs, max = Max} = State,
Doc = sphinx_doc:element(Hash, Name, Files, ID, Query, CreatedAt),
{ID, Doc} = create_doc(DocT),
{NewDocs, NewIDs} = try_save([Doc|Docs], Max, [ID|IDs]),
{reply, ok, State#state{docs = NewDocs, ids = NewIDs}};
@ -83,3 +84,20 @@ get_id_range([First|IDs]) ->
lists:foldl(fun(ID, {Min, Max}) ->
{min(ID, Min), max(ID, Max)}
end, {First, First}, IDs).
%% Build the document for one insert request.
%% Takes the tuple {ID, Hash, Name, Files, Query, CreatedAt}, passes the
%% name through valid_name/1 and the file list through
%% valid_file_names/1, and hands the results to sphinx_doc:element/6.
%% Returns {ID, Doc} so the caller can track the id alongside the document.
create_doc({ID, Hash, Name, Files, Query, CreatedAt}) ->
    Element = sphinx_doc:element(Hash,
                                 valid_name(Name),
                                 valid_file_names(Files),
                                 ID, Query, CreatedAt),
    {ID, Element}.
%% Apply valid_name/1 to the name of every {Name, Length} pair in
%% `Files`, leaving the second element unchanged. Elements that are not
%% 2-tuples are dropped by the comprehension's pattern filter.
valid_file_names(Files) ->
    [{valid_name(FileName), Size} || {FileName, Size} <- Files].
valid_name(S) ->
ValidName = string_util:strip_invalid_unicode(S),
if length(ValidName) < length(S) ->
?I(?FMT("~s -> ~s", [S, ValidName]));
true -> ok