From 8ba358e3c632110321f11a2205f07ed6a0285a15 Mon Sep 17 00:00:00 2001 From: LangLangbart <92653266+LangLangBart@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:19:48 +0200 Subject: [PATCH] feat: add pcregrep support for multiline patterns --- gh-find-code | 56 ++++++++++++++++++++++++++++++++++++---------------- readme.md | 5 +++++ 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/gh-find-code b/gh-find-code index 2a8535b..44848f2 100755 --- a/gh-find-code +++ b/gh-find-code @@ -526,15 +526,20 @@ gh_query() { # processing these patterns later, split on \x1f, which is equivalent to the \u001F. # https://condor.depaul.edu/sjost/lsp121/documents/ascii-npr.htm # https://datatracker.ietf.org/doc/html/rfc20#section-4.1 - # Remove all newline characters from any pattern - patterns: ([.value.text_matches[] | .. | .text? | select(type=="string") | gsub("\n"; "")] as $patterns_array | + + # Remove leading and trailing whitespace (including spaces and newlines) from the + # patterns. Replace any remaining newline characters within the patterns with the + # Unicode symbol for newline (␤) to maintain single-line processing. Note: Patterns with + # newlines will not match correctly in subsequent processing unless pcregrep is + # installed. In that case, the symbol will be replaced by a newline during matching. + patterns: ([.value.text_matches[] | .. | .text? | select(type=="string") | + sub("^\\\s+"; "") | sub("\\\s+$"; "") | gsub("\n"; "\u2424")] as $patterns_array | if $patterns_array == [] then "__NoPatternFound__" else $patterns_array | unique | join("\u001F") end) # Separating the fields with the Record Separator (RS). @tsv is not suitable because it # double-escapes escaped characters. The @tsv had the advantage of printing its input as a - # single line, thus it is necessary to remove trailing newline characters from patterns. 
@sh - # is also not viable as it uses spaces as delimiters, which cannot be reliably used since - # file paths can contain spaces. + # single line. @sh is also not viable as it uses spaces as delimiters, which cannot be + # reliably used since file paths can contain spaces. } | [.index, .owner_repo_name, .file_name, .file_path, .patterns] | join("\u001e"))' \ 2>"$store_gh_search_error") || [[ -z $data ]]; then if grep --quiet --ignore-case "API rate limit exceeded" "$store_gh_search_error"; then @@ -698,18 +703,33 @@ EOF if [[ $patterns != "__NoPatternFound__" ]]; then # Patterns split by 'Unit Separator (US)' IFS=$'\x1F' read -ra pattern_array <<<"$patterns" - grep_args=() - for pattern in "${pattern_array[@]}"; do - grep_args+=("--regexp=$pattern") - done - - # Use the '--text' flag, as grep will simply print 'Binary file … matches' if - # the file contains binary characters. It won't even throw an error. - # https://unix.stackexchange.com/questions/19907 - command grep --color=never --line-number --text --fixed-strings "${grep_args[@]}" -- \ - "${store_file_contents}_${index}_fetched" 2>"${redirect_location}" | - command cut -d: -f1 >>"${store_file_contents}_${index}_line_numbers" + grep_args=("--color=never" "--line-number" "--text") + if command -v pcregrep >/dev/null && + command grep --quiet --max-count=1 --fixed-strings '␤' <<<"$patterns"; then + # The API sometimes returns a newline character as part of a pattern, + # especially for texts that contain many Chinese characters. The jq query + # replaces that character with '␤'; if a pattern contains '␤', it is + # replaced with '\n' here and matched with pcregrep in multiline mode. 
+ for pattern in "${pattern_array[@]}"; do + sanitized_patterns=$(command sed 's/[][?*+.$^(){}]/\\&/g' <<<"$pattern") + grep_args+=("--regexp=${sanitized_patterns//␤/\\n}") + done + command pcregrep --multiline "${grep_args[@]}" -- \ + "${store_file_contents}_${index}_fetched" 2>"${redirect_location}" | + command cut -d: -f1 >>"${store_file_contents}_${index}_line_numbers" + else + for pattern in "${pattern_array[@]}"; do + grep_args+=("--regexp=$pattern") + done + + # Use the '--text' flag, as grep will simply print 'Binary file … matches' if + # the file contains binary characters. It won't even throw an error. + # https://unix.stackexchange.com/questions/19907 + command grep --fixed-strings "${grep_args[@]}" -- \ + "${store_file_contents}_${index}_fetched" 2>"${redirect_location}" | + command cut -d: -f1 >>"${store_file_contents}_${index}_line_numbers" + fi # Save debug info only if an error is encountered if ((GHFC_DEBUG_MODE)) && [[ -s ${store_grep_extended_debug}_${index} ]]; then { @@ -733,11 +753,13 @@ EOF if ! 
base_name=$(command basename "$file_path" 2>/dev/null); then base_name="…${file_path: -30}" fi + + line_number=1 if [[ -s "${store_file_contents}_${index}_line_numbers" ]]; then line_number=$(command head -1 "${store_file_contents}_${index}_line_numbers") fi printf "%s\t%s\t%b%-3d%b\t%b%s%b/%b%s%b\t%b%s/%b%s%b\n" \ - "${line_number:-1}" "$file_extension" "$index_color" \ + "$line_number" "$file_extension" "$index_color" \ "$index" "$COLOR_RESET" "$CYAN_NORMAL" "${owner_repo_name%/*}" "$COLOR_RESET" \ "$CYAN_BOLD" "${owner_repo_name#*/}" "$COLOR_RESET" "$MAGENTA_NORMAL" \ "$dir_name" "$MAGENTA_BOLD" "$base_name" "$COLOR_RESET" | diff --git a/readme.md b/readme.md index efa3bdd..36e780e 100644 --- a/readme.md +++ b/readme.md @@ -189,6 +189,11 @@ GHFC_HISTORY_FILE="/custom/location/history.txt" gh find-code GHFC_HISTORY_LIMIT="1000" gh find-code ``` +### pcregrep +- If the API returns patterns with newline characters, `pcregrep` will be used to find line numbers + if installed; otherwise, `grep` will be used by default, which may not match patterns containing + newlines. + --- ## 💪 Contributing