Skip to content

Commit

Permalink
Merge pull request borgbackup#8436 from ThomasWaldmann/analyze-cmd
Browse files Browse the repository at this point in the history
analyze: changed chunks per directory
  • Loading branch information
ThomasWaldmann authored Oct 2, 2024
2 parents 5a87b41 + de439ee commit 8cd951f
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 0 deletions.
91 changes: 91 additions & 0 deletions docs/man/borg-analyze.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
.\" Man page generated from reStructuredText.
.
.
.nr rst2man-indent-level 0
.
.de1 rstReportMargin
\\$1 \\n[an-margin]
level \\n[rst2man-indent-level]
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
-
\\n[rst2man-indent0]
\\n[rst2man-indent1]
\\n[rst2man-indent2]
..
.de1 INDENT
.\" .rstReportMargin pre:
. RS \\$1
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
. nr rst2man-indent-level +1
.\" .rstReportMargin post:
..
.de UNINDENT
. RE
.\" indent \\n[an-margin]
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
.nr rst2man-indent-level -1
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
..
.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool"
.SH NAME
borg-analyze \- Analyze archives
.SH SYNOPSIS
.sp
borg [common options] analyze [options]
.SH DESCRIPTION
.sp
Analyze archives to find \(dqhot spots\(dq.
.sp
Borg analyze relies on the usual archive matching options to select the
archives that should be considered for analysis (e.g. \fB\-a series_name\fP).
Then it iterates over all matching archives, over all contained files and
collects information about chunks stored in all directories it encountered.
.sp
It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed
size in the repository easily available) and adds up added/removed chunks\(aq
sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq.
.sp
You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe
some of these are temporary or cache directories you did forget to exclude.
.sp
To not have these unwanted directories in your backups, you could carefully
exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP
to re\-create existing archives without these.
.SH OPTIONS
.sp
See \fIborg\-common(1)\fP for common options of Borg commands.
.SS Archive filters
.INDENT 0.0
.TP
.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN
only consider archives matching all patterns. see \(dqborg help match\-archives\(dq.
.TP
.BI \-\-sort\-by \ KEYS
Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
.TP
.BI \-\-first \ N
consider first N archives after other filters were applied
.TP
.BI \-\-last \ N
consider last N archives after other filters were applied
.TP
.BI \-\-oldest \ TIMESPAN
consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
.TP
.BI \-\-newest \ TIMESPAN
consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m.
.TP
.BI \-\-older \ TIMESPAN
consider archives older than (now \- TIMESPAN), e.g. 7d or 12m.
.TP
.BI \-\-newer \ TIMESPAN
consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m.
.UNINDENT
.SH SEE ALSO
.sp
\fIborg\-common(1)\fP
.SH AUTHOR
The Borg Collective
.\" Generated by docutils manpage writer.
.
1 change: 1 addition & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ Usage
usage/delete
usage/prune
usage/info
usage/analyze
usage/mount
usage/recreate
usage/tar
Expand Down
1 change: 1 addition & 0 deletions docs/usage/analyze.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.. include:: analyze.rst.inc
84 changes: 84 additions & 0 deletions docs/usage/analyze.rst.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!

.. _borg_analyze:

borg analyze
------------
.. code-block:: none

borg [common options] analyze [options]

.. only:: html

.. class:: borg-options-table

+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| .. class:: borg-common-opt-ref |
| |
| :ref:`common_options` |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| **Archive filters** — Archive filters can be applied to repository targets. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--first N`` | consider first N archives after other filters were applied |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--last N`` | consider last N archives after other filters were applied |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+

.. raw:: html

<script type='text/javascript'>
$(document).ready(function () {
$('.borg-options-table colgroup').remove();
})
</script>

.. only:: latex



:ref:`common_options`
|

Archive filters
-a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives".
--sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
--first N consider first N archives after other filters were applied
--last N consider last N archives after other filters were applied
--oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
--newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.
--older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m.
--newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.


Description
~~~~~~~~~~~

Analyze archives to find "hot spots".

Borg analyze relies on the usual archive matching options to select the
archives that should be considered for analysis (e.g. ``-a series_name``).
Then it iterates over all matching archives, over all contained files and
collects information about chunks stored in all directories it encountered.

It considers chunk IDs and their plaintext sizes (we don't have the compressed
size in the repository easily available) and adds up added/removed chunks'
sizes per direct parent directory and outputs a list of "directory: size".

You can use that list to find directories with a lot of "activity" - maybe
some of these are temporary or cache directories you did forget to exclude.

To not have these unwanted directories in your backups, you could carefully
exclude these in ``borg create`` (for future backups) or use ``borg recreate``
to re-create existing archives without these.
3 changes: 3 additions & 0 deletions src/borg/archiver/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def get_func(args):
raise Exception("expected func attributes not found")


from .analyze_cmd import AnalyzeMixIn
from .benchmark_cmd import BenchmarkMixIn
from .check_cmd import CheckMixIn
from .compact_cmd import CompactMixIn
Expand Down Expand Up @@ -94,6 +95,7 @@ def get_func(args):


class Archiver(
AnalyzeMixIn,
BenchmarkMixIn,
CheckMixIn,
CompactMixIn,
Expand Down Expand Up @@ -332,6 +334,7 @@ def build_parser(self):

subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")

self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
self.build_parser_check(subparsers, common_parser, mid_common_parser)
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
Expand Down
135 changes: 135 additions & 0 deletions src/borg/archiver/analyze_cmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import argparse
from collections import defaultdict
import os

from ._common import with_repository, define_archive_filters_group
from ..archive import Archive
from ..constants import * # NOQA
from ..helpers import bin_to_hex, Error
from ..helpers import ProgressIndicatorPercent
from ..manifest import Manifest
from ..remote import RemoteRepository
from ..repository import Repository

from ..logger import create_logger

# Module-level logger; ArchiveAnalyzer below emits its progress messages through it.
logger = create_logger()


class ArchiveAnalyzer:
    """Find "hot spot" directories by comparing the chunks of consecutive archives.

    The archives selected by the archive filter options are walked in order.
    Each archive is compared with its predecessor and, per direct parent
    directory, the plaintext sizes of all chunks that were added or removed
    are summed up in ``difference_by_path``.
    """

    def __init__(self, args, repository, manifest):
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        # directory path -> summed plaintext size of all added/removed chunks
        self.difference_by_path = defaultdict(int)

    def analyze(self):
        """Run the complete analysis and print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria.

        Raises Error if fewer than 2 archives match, as there is nothing to compare then.
        """
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        base = None  # chunks of the previous archive; the first archive has no predecessor
        for i, info in enumerate(archive_infos):
            pi.show(i)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
            new = self.analyze_archive(info.id)
            if base is not None:
                self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """Compute the chunks for each directory in the archive with the given id.

        Returns a mapping: directory path -> {chunk id: plaintext size}, built
        from all items of the archive that have chunks.
        """
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(dict)  # collect all chunks generated from files in this directory path
        for item in archive.iter_items():
            if "chunks" in item:
                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
                directory_path = os.path.dirname(item.path)
                chunks_by_path[directory_path].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """For each directory path, sum up the changed (removed or added) chunks' sizes between base and new.

        base and new are mappings as returned by analyze_archive(). Neither of
        them is modified; only self.difference_by_path is updated.
        """

        def analyze_path_change(path):
            # .get() instead of indexing: indexing a defaultdict would insert empty
            # entries into the caller's mappings (and new is reused as the next base).
            base_chunks = base.get(path, {})
            new_chunks = new.get(path, {})
            # add up added chunks' sizes
            for chunk_id in new_chunks.keys() - base_chunks.keys():
                self.difference_by_path[path] += new_chunks[chunk_id]
            # add up removed chunks' sizes
            for chunk_id in base_chunks.keys() - new_chunks.keys():
                self.difference_by_path[path] += base_chunks[chunk_id]

        # directories known to base (changed or removed in new) ...
        for directory_path in base:
            analyze_path_change(directory_path)
        # ... and directories that only exist in new
        for directory_path in new:
            if directory_path not in base:
                analyze_path_change(directory_path)

    def report(self):
        """Print "directory: size" lines, directories with the biggest summed change first."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
            difference = self.difference_by_path[directory_path]
            print(f"{directory_path}: {difference}")


class AnalyzeMixIn:
    """Mix-in contributing the ``analyze`` sub-command: its handler and its argument parser."""

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        analyzer = ArchiveAnalyzer(args, repository, manifest)
        analyzer.analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        """Register the ``analyze`` sub-command and its archive filter options on subparsers."""
        from ._common import process_epilog

        # Long help text shown below the option list; kept verbatim.
        epilog_source = """
Analyze archives to find "hot spots".
Borg analyze relies on the usual archive matching options to select the
archives that should be considered for analysis (e.g. ``-a series_name``).
Then it iterates over all matching archives, over all contained files and
collects information about chunks stored in all directories it encountered.
It considers chunk IDs and their plaintext sizes (we don't have the compressed
size in the repository easily available) and adds up added/removed chunks'
sizes per direct parent directory and outputs a list of "directory: size".
You can use that list to find directories with a lot of "activity" - maybe
some of these are temporary or cache directories you did forget to exclude.
To not have these unwanted directories in your backups, you could carefully
exclude these in ``borg create`` (for future backups) or use ``borg recreate``
to re-create existing archives without these.
"""
        processed_epilog = process_epilog(epilog_source)
        parser_options = dict(
            parents=[common_parser],
            add_help=False,
            description=self.do_analyze.__doc__,  # the handler's docstring doubles as the short description
            epilog=processed_epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="analyze archives",
        )
        analyze_parser = subparsers.add_parser("analyze", **parser_options)
        analyze_parser.set_defaults(func=self.do_analyze)
        define_archive_filters_group(analyze_parser)
41 changes: 41 additions & 0 deletions src/borg/testsuite/archiver/analyze_cmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pathlib

from ...constants import * # NOQA
from . import cmd, generate_archiver_tests, RK_ENCRYPTION

def pytest_generate_tests(metafunc):
    """Hand test generation over to generate_archiver_tests, restricted to kinds="local"."""
    return generate_archiver_tests(metafunc, kinds="local")


def test_analyze(archivers, request):
    """Check that ``borg analyze`` sums up the sizes of added/removed chunks for the input directory."""
    archiver = request.getfixturevalue(archivers)

    def backup():
        # every archive gets the same name, so "-a archive" below selects all of them
        cmd(archiver, "create", "archive", archiver.input_path)

    def analysis_output():
        return cmd(archiver, "analyze", "-a", "archive")

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    source_dir = pathlib.Path(archiver.input_path)

    # 1st archive: baseline with a 1 byte file
    source_dir.joinpath("file1").write_text("1")
    backup()

    # 2nd archive: adds file2 (2 bytes)
    source_dir.joinpath("file2").write_text("22")
    backup()

    output = analysis_output()
    assert "/input: 2" in output  # 2 bytes added by the 2nd archive

    # 3rd archive: adds file3 (3 bytes)
    source_dir.joinpath("file3").write_text("333")
    backup()

    output = analysis_output()
    assert "/input: 5" in output  # 2 + 3 bytes added by the 2nd and 3rd archives

    # 4th archive: file2 is gone again
    source_dir.joinpath("file2").unlink()
    backup()

    output = analysis_output()
    assert "/input: 7" in output  # 2 + 3 bytes added, then 2 bytes removed by the 4th archive

0 comments on commit 8cd951f

Please sign in to comment.