forked from borgbackup/borg
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request borgbackup#8436 from ThomasWaldmann/analyze-cmd
analyze: changed chunks per directory
- Loading branch information
Showing
7 changed files
with
356 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
.\" Man page generated from reStructuredText. | ||
. | ||
. | ||
.nr rst2man-indent-level 0 | ||
. | ||
.de1 rstReportMargin | ||
\\$1 \\n[an-margin] | ||
level \\n[rst2man-indent-level] | ||
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] | ||
- | ||
\\n[rst2man-indent0] | ||
\\n[rst2man-indent1] | ||
\\n[rst2man-indent2] | ||
.. | ||
.de1 INDENT | ||
.\" .rstReportMargin pre: | ||
. RS \\$1 | ||
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] | ||
. nr rst2man-indent-level +1 | ||
.\" .rstReportMargin post: | ||
.. | ||
.de UNINDENT | ||
. RE | ||
.\" indent \\n[an-margin] | ||
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] | ||
.nr rst2man-indent-level -1 | ||
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] | ||
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u | ||
.. | ||
.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool" | ||
.SH NAME | ||
borg-analyze \- Analyze archives | ||
.SH SYNOPSIS | ||
.sp | ||
borg [common options] analyze [options] | ||
.SH DESCRIPTION | ||
.sp | ||
Analyze archives to find \(dqhot spots\(dq. | ||
.sp | ||
Borg analyze relies on the usual archive matching options to select the | ||
archives that should be considered for analysis (e.g. \fB\-a series_name\fP). | ||
Then it iterates over all matching archives, over all contained files and | ||
collects information about chunks stored in all directories it encountered. | ||
.sp | ||
It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed | ||
size in the repository easily available) and adds up added/removed chunks\(aq | ||
sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq. | ||
.sp | ||
You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe | ||
some of these are temporary or cache directories you did forget to exclude. | ||
.sp | ||
To not have these unwanted directories in your backups, you could carefully | ||
exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP | ||
to re\-create existing archives without these. | ||
.SH OPTIONS | ||
.sp | ||
See \fIborg\-common(1)\fP for common options of Borg commands. | ||
.SS Archive filters | ||
.INDENT 0.0 | ||
.TP | ||
.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN | ||
only consider archives matching all patterns. see \(dqborg help match\-archives\(dq. | ||
.TP | ||
.BI \-\-sort\-by \ KEYS | ||
Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp | ||
.TP | ||
.BI \-\-first \ N | ||
consider first N archives after other filters were applied | ||
.TP | ||
.BI \-\-last \ N | ||
consider last N archives after other filters were applied | ||
.TP | ||
.BI \-\-oldest \ TIMESPAN | ||
consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | ||
.TP | ||
.BI \-\-newest \ TIMESPAN | ||
consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m. | ||
.TP | ||
.BI \-\-older \ TIMESPAN | ||
consider archives older than (now \- TIMESPAN), e.g. 7d or 12m. | ||
.TP | ||
.BI \-\-newer \ TIMESPAN | ||
consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m. | ||
.UNINDENT | ||
.SH SEE ALSO | ||
.sp | ||
\fIborg\-common(1)\fP | ||
.SH AUTHOR | ||
The Borg Collective | ||
.\" Generated by docutils manpage writer. | ||
. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,7 @@ Usage | |
usage/delete | ||
usage/prune | ||
usage/info | ||
usage/analyze | ||
usage/mount | ||
usage/recreate | ||
usage/tar | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. include:: analyze.rst.inc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit! | ||
|
||
.. _borg_analyze: | ||
|
||
borg analyze | ||
------------ | ||
.. code-block:: none | ||
|
||
borg [common options] analyze [options] | ||
|
||
.. only:: html | ||
|
||
.. class:: borg-options-table | ||
|
||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| .. class:: borg-common-opt-ref | | ||
| | | ||
| :ref:`common_options` | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| **Archive filters** — Archive filters can be applied to repository targets. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--first N`` | consider first N archives after other filters were applied | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--last N`` | consider last N archives after other filters were applied | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
|
||
.. raw:: html | ||
|
||
<script type='text/javascript'> | ||
$(document).ready(function () { | ||
$('.borg-options-table colgroup').remove(); | ||
}) | ||
</script> | ||
|
||
.. only:: latex | ||
|
||
|
||
|
||
:ref:`common_options` | ||
| | ||
|
||
Archive filters | ||
-a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives". | ||
--sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp | ||
--first N consider first N archives after other filters were applied | ||
--last N consider last N archives after other filters were applied | ||
--oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | ||
--newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | ||
--older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m. | ||
--newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | ||
|
||
|
||
Description | ||
~~~~~~~~~~~ | ||
|
||
Analyze archives to find "hot spots". | ||
|
||
Borg analyze relies on the usual archive matching options to select the | ||
archives that should be considered for analysis (e.g. ``-a series_name``). | ||
Then it iterates over all matching archives, over all contained files and | ||
collects information about chunks stored in all directories it encountered. | ||
|
||
It considers chunk IDs and their plaintext sizes (we don't have the compressed | ||
size in the repository easily available) and adds up added/removed chunks' | ||
sizes per direct parent directory and outputs a list of "directory: size". | ||
|
||
You can use that list to find directories with a lot of "activity" - maybe | ||
some of these are temporary or cache directories you did forget to exclude. | ||
|
||
To not have these unwanted directories in your backups, you could carefully | ||
exclude these in ``borg create`` (for future backups) or use ``borg recreate`` | ||
to re-create existing archives without these. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
import argparse | ||
from collections import defaultdict | ||
import os | ||
|
||
from ._common import with_repository, define_archive_filters_group | ||
from ..archive import Archive | ||
from ..constants import * # NOQA | ||
from ..helpers import bin_to_hex, Error | ||
from ..helpers import ProgressIndicatorPercent | ||
from ..manifest import Manifest | ||
from ..remote import RemoteRepository | ||
from ..repository import Repository | ||
|
||
from ..logger import create_logger | ||
|
||
logger = create_logger() | ||
|
||
|
||
class ArchiveAnalyzer:
    """Analyze a series of archives to find directories with a lot of chunk churn ("hot spots")."""

    def __init__(self, args, repository, manifest):
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        # directory path -> summed plaintext sizes of chunks added or removed under that path
        self.difference_by_path = defaultdict(int)

    def analyze(self):
        """Run the analysis over all selected archives and print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria."""
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            # we compare consecutive archives, so a single archive yields nothing
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        base = None  # chunks-by-path mapping of the previously analyzed archive
        for i, info in enumerate(archive_infos):
            pi.show(i)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
            new = self.analyze_archive(info.id)
            if base is not None:
                self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """Compute the chunks (chunk id -> plaintext size) per directory in the archive with this *id*."""
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(dict)  # directory path -> {chunk id: plaintext size}
        for item in archive.iter_items():
            if "chunks" in item:
                # aggregate the file's chunks under its direct parent directory
                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
                chunks_by_path[os.path.dirname(item.path)].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """For each directory path, sum up the changed (removed or added) chunks' sizes between *base* and *new*."""

        def analyze_path_change(path):
            # use .get() so we do not insert empty entries into the defaultdicts we were given
            base_chunks = base.get(path, {})
            new_chunks = new.get(path, {})
            # add up added chunks' sizes
            for chunk_id in new_chunks.keys() - base_chunks.keys():
                self.difference_by_path[path] += new_chunks[chunk_id]
            # add up removed chunks' sizes
            for chunk_id in base_chunks.keys() - new_chunks.keys():
                self.difference_by_path[path] += base_chunks[chunk_id]

        # consider every path present in either archive, each exactly once
        for path in base.keys() | new.keys():
            analyze_path_change(path)

    def report(self):
        """Print directories sorted by descending amount of change."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        for path, difference in sorted(self.difference_by_path.items(), key=lambda kv: kv[1], reverse=True):
            print(f"{path}: {difference}")
|
||
|
||
class AnalyzeMixIn:
    # Mixin that contributes the ``borg analyze`` subcommand (handler + argparse wiring)
    # to the archiver class it is mixed into.

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        # NOTE: the docstring above is user-visible — it is reused as the subcommand
        # description in build_parser_analyze(); do not edit it casually.
        ArchiveAnalyzer(args, repository, manifest).analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        # Register the ``analyze`` subcommand on *subparsers*.
        # *mid_common_parser* is part of the common build_parser_* signature but unused here.
        from ._common import process_epilog

        # long help text shown by ``borg analyze --help``; process_epilog normalizes the indentation
        analyze_epilog = process_epilog(
            """
        Analyze archives to find "hot spots".
        Borg analyze relies on the usual archive matching options to select the
        archives that should be considered for analysis (e.g. ``-a series_name``).
        Then it iterates over all matching archives, over all contained files and
        collects information about chunks stored in all directories it encountered.
        It considers chunk IDs and their plaintext sizes (we don't have the compressed
        size in the repository easily available) and adds up added/removed chunks'
        sizes per direct parent directory and outputs a list of "directory: size".
        You can use that list to find directories with a lot of "activity" - maybe
        some of these are temporary or cache directories you did forget to exclude.
        To not have these unwanted directories in your backups, you could carefully
        exclude these in ``borg create`` (for future backups) or use ``borg recreate``
        to re-create existing archives without these.
        """
        )
        subparser = subparsers.add_parser(
            "analyze",
            parents=[common_parser],
            add_help=False,
            description=self.do_analyze.__doc__,
            epilog=analyze_epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="analyze archives",
        )
        subparser.set_defaults(func=self.do_analyze)
        # archive selection options (-a/--match-archives, --first/--last, age filters, ...)
        define_archive_filters_group(subparser)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import pathlib | ||
|
||
from ...constants import * # NOQA | ||
from . import cmd, generate_archiver_tests, RK_ENCRYPTION | ||
|
||
def pytest_generate_tests(metafunc):
    """Parametrize the archiver tests in this module for the "local" repository kind."""
    # a proper def instead of a lambda assignment (PEP 8 / ruff E731)
    return generate_archiver_tests(metafunc, kinds="local")
|
||
|
||
def test_analyze(archivers, request):
    """End-to-end check that ``borg analyze`` sums changed chunks per directory across archives."""
    archiver = request.getfixturevalue(archivers)

    def snapshot():
        # take another archive of the input directory (same series name "archive")
        cmd(archiver, "create", "archive", archiver.input_path)

    def churn():
        # run the analyzer over the whole "archive" series and return its output
        return cmd(archiver, "analyze", "-a", "archive")

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    root = pathlib.Path(archiver.input_path)

    # 1st archive
    (root / "file1").write_text("1")
    snapshot()

    # 2nd archive
    (root / "file2").write_text("22")
    snapshot()

    assert "/input: 2" in churn()  # 2nd archive added 1 chunk for input path

    # 3rd archive
    (root / "file3").write_text("333")
    snapshot()

    assert "/input: 5" in churn()  # 2nd/3rd archives added 2 chunks for input path

    # 4th archive
    (root / "file2").unlink()
    snapshot()

    assert "/input: 7" in churn()  # 2nd/3rd archives added 2, 4th archive removed 1