From ec85a753ccfaf4b9be01e8364b86a5c21a5da136 Mon Sep 17 00:00:00 2001 From: LangLangbart <92653266+LangLangBart@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:08:59 +0200 Subject: [PATCH] feat: expand tool set for multiline pattern matching --- gh-find-code | 31 +++++++++++++++++++++++-------- readme.md | 8 ++++---- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/gh-find-code b/gh-find-code index 44848f2..c6914d0 100755 --- a/gh-find-code +++ b/gh-find-code @@ -151,6 +151,7 @@ min_gh_version="2.37.0" # https://docs.python.org/3/library/urllib.parse.html min_python_version="3.0.0" python_executable="" +multiline_grep_executable="" # Creating temporary files. The current setup works but is very verbose. An attempt to use # associative arrays with 'declare -A' was unsuccessful as I couldn't access the associated # filename in child processes. @@ -273,6 +274,15 @@ validate_environment() { # If no suitable python version was found, terminate the script [[ -z $python_executable ]] && die "No suitable 'python' version found. Required: 'python >= $min_python_version'." + # Optional: Check for advanced pattern matching support + for value in pcre2grep pcregrep rg; do + if command -v $value >/dev/null && + $value --quiet --multiline --regexp='A\nB' <<<"$(printf "A\nB")" >/dev/null 2>&1; then + multiline_grep_executable="$value" + break + fi + done + # Verify if there are at least two spaces between columns. The delimiter in 'fzf' is set to # '\t' or '\s\s+' to separate fields. By default, the 'column' command should separate any # columns with two spaces. If this is not the case, you cannot proceed. It appears that @@ -530,8 +540,9 @@ gh_query() { # Remove leading and trailing whitespace (including spaces and newlines) from the # patterns. Replace any remaining newline characters within the patterns with the # Unicode symbol for newline (␤) to maintain single-line processing. Note: Patterns with - # newlines will not match correctly in subsequent processing unless pcregrep is - # installed. In that case, the symbol will be replaced by a newline during matching. + # newlines will not match correctly in subsequent processing unless a tool with + # multiline support is installed. In that case, the symbol will be replaced by a newline + # during matching. patterns: ([.value.text_matches[] | .. | .text? | select(type=="string") | sub("^\\\s+"; "") | sub("\\\s+$"; "") | gsub("\n"; "\u2424")] as $patterns_array | if $patterns_array == [] then "__NoPatternFound__" else $patterns_array | unique | join("\u001F") end) @@ -705,17 +716,21 @@ EOF IFS=$'\x1F' read -ra pattern_array <<<"$patterns" grep_args=("--color=never" "--line-number" "--text") - if command -v pcregrep >/dev/null && + if [[ -n $multiline_grep_executable ]] && command grep --quiet --max-count=1 --fixed-strings '␤' <<<"$patterns"; then - # The API sometimes returns a newline character as part of a pattern - # especially for texts where lots of Chinese characters are used, the - # character is replaced with '␤' inside the jq query, if such a pattern - # contains '␤' replace it with '\n' and try it with pcregrep multiline. + # The API sometimes returns a newline character as part of a pattern. The + # character is replaced with '␤' inside the jq query. If such a pattern + # contains '␤', replace it with '\n' and try using a tool that supports + # multiline, as this may be a bug in the API. These scenarios occur more + # frequently when the surrounding text contains many non-ASCII characters. + + # TODO: Read up on how the GitHub Search API works and provide a better + # explanation of why some patterns contain newline characters. for pattern in "${pattern_array[@]}"; do sanitized_patterns=$(command sed 's/[][?*+.$^(){}]/\\&/g' <<<"$pattern") grep_args+=("--regexp=${sanitized_patterns//␤/\\n}") done - command pcregrep --multiline "${grep_args[@]}" -- \ + command "$multiline_grep_executable" --multiline "${grep_args[@]}" -- \ "${store_file_contents}_${index}_fetched" 2>"${redirect_location}" | command cut -d: -f1 >>"${store_file_contents}_${index}_line_numbers" else diff --git a/readme.md b/readme.md index 36e780e..994164e 100644 --- a/readme.md +++ b/readme.md @@ -189,10 +189,10 @@ GHFC_HISTORY_FILE="/custom/location/history.txt" gh find-code GHFC_HISTORY_LIMIT="1000" gh find-code ``` -### pcregrep -- If the API returns patterns with newline characters, `pcregrep` will be used to find line numbers - if installed; otherwise, `grep` will be used by default, which may not match patterns containing - newlines. +### Pattern Matching +- In rare cases, when the API returns patterns with newline characters, `pcre2grep`, `pcregrep`, or + `rg` will be used to find line numbers if any of them is installed. Otherwise, `grep` will be used + by default, which will not match patterns containing newlines. ---