forked from borgbackup/borg
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request borgbackup#8436 from ThomasWaldmann/analyze-cmd
analyze: changed chunks per directory
- Loading branch information
Showing
7 changed files
with
356 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
.\" Man page generated from reStructuredText. | ||
. | ||
. | ||
.nr rst2man-indent-level 0 | ||
. | ||
.de1 rstReportMargin | ||
\\$1 \\n[an-margin] | ||
level \\n[rst2man-indent-level] | ||
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] | ||
- | ||
\\n[rst2man-indent0] | ||
\\n[rst2man-indent1] | ||
\\n[rst2man-indent2] | ||
.. | ||
.de1 INDENT | ||
.\" .rstReportMargin pre: | ||
. RS \\$1 | ||
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] | ||
. nr rst2man-indent-level +1 | ||
.\" .rstReportMargin post: | ||
.. | ||
.de UNINDENT | ||
. RE | ||
.\" indent \\n[an-margin] | ||
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] | ||
.nr rst2man-indent-level -1 | ||
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] | ||
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u | ||
.. | ||
.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool" | ||
.SH NAME | ||
borg-analyze \- Analyze archives | ||
.SH SYNOPSIS | ||
.sp | ||
borg [common options] analyze [options] | ||
.SH DESCRIPTION | ||
.sp | ||
Analyze archives to find \(dqhot spots\(dq. | ||
.sp | ||
Borg analyze relies on the usual archive matching options to select the | ||
archives that should be considered for analysis (e.g. \fB\-a series_name\fP). | ||
Then it iterates over all matching archives, over all contained files and | ||
collects information about chunks stored in all directories it encountered. | ||
.sp | ||
It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed | ||
size in the repository easily available) and adds up added/removed chunks\(aq | ||
sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq. | ||
.sp | ||
You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe | ||
some of these are temporary or cache directories you did forget to exclude. | ||
.sp | ||
To not have these unwanted directories in your backups, you could carefully | ||
exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP | ||
to re\-create existing archives without these. | ||
.SH OPTIONS | ||
.sp | ||
See \fIborg\-common(1)\fP for common options of Borg commands. | ||
.SS Archive filters | ||
.INDENT 0.0 | ||
.TP | ||
.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN | ||
only consider archives matching all patterns. see \(dqborg help match\-archives\(dq. | ||
.TP | ||
.BI \-\-sort\-by \ KEYS | ||
Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp | ||
.TP | ||
.BI \-\-first \ N | ||
consider first N archives after other filters were applied | ||
.TP | ||
.BI \-\-last \ N | ||
consider last N archives after other filters were applied | ||
.TP | ||
.BI \-\-oldest \ TIMESPAN | ||
consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | ||
.TP | ||
.BI \-\-newest \ TIMESPAN | ||
consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m. | ||
.TP | ||
.BI \-\-older \ TIMESPAN | ||
consider archives older than (now \- TIMESPAN), e.g. 7d or 12m. | ||
.TP | ||
.BI \-\-newer \ TIMESPAN | ||
consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m. | ||
.UNINDENT | ||
.SH SEE ALSO | ||
.sp | ||
\fIborg\-common(1)\fP | ||
.SH AUTHOR | ||
The Borg Collective | ||
.\" Generated by docutils manpage writer. | ||
. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,7 @@ Usage | |
usage/delete | ||
usage/prune | ||
usage/info | ||
usage/analyze | ||
usage/mount | ||
usage/recreate | ||
usage/tar | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. include:: analyze.rst.inc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit! | ||
|
||
.. _borg_analyze: | ||
|
||
borg analyze | ||
------------ | ||
.. code-block:: none | ||
|
||
borg [common options] analyze [options] | ||
|
||
.. only:: html | ||
|
||
.. class:: borg-options-table | ||
|
||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| .. class:: borg-common-opt-ref | | ||
| | | ||
| :ref:`common_options` | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| **Archive filters** — Archive filters can be applied to repository targets. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--first N`` | consider first N archives after other filters were applied | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--last N`` | consider last N archives after other filters were applied | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
| | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | | ||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ | ||
|
||
.. raw:: html | ||
|
||
<script type='text/javascript'> | ||
$(document).ready(function () { | ||
$('.borg-options-table colgroup').remove(); | ||
}) | ||
</script> | ||
|
||
.. only:: latex | ||
|
||
|
||
|
||
:ref:`common_options` | ||
| | ||
|
||
Archive filters | ||
-a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives". | ||
--sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp | ||
--first N consider first N archives after other filters were applied | ||
--last N consider last N archives after other filters were applied | ||
--oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | ||
--newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | ||
--older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m. | ||
--newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | ||
|
||
|
||
Description | ||
~~~~~~~~~~~ | ||
|
||
Analyze archives to find "hot spots". | ||
|
||
Borg analyze relies on the usual archive matching options to select the | ||
archives that should be considered for analysis (e.g. ``-a series_name``). | ||
Then it iterates over all matching archives, over all contained files and | ||
collects information about chunks stored in all directories it encountered. | ||
|
||
It considers chunk IDs and their plaintext sizes (we don't have the compressed | ||
size in the repository easily available) and adds up added/removed chunks' | ||
sizes per direct parent directory and outputs a list of "directory: size". | ||
|
||
You can use that list to find directories with a lot of "activity" - maybe | ||
some of these are temporary or cache directories you did forget to exclude. | ||
|
||
To not have these unwanted directories in your backups, you could carefully | ||
exclude these in ``borg create`` (for future backups) or use ``borg recreate`` | ||
to re-create existing archives without these. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
import argparse | ||
from collections import defaultdict | ||
import os | ||
|
||
from ._common import with_repository, define_archive_filters_group | ||
from ..archive import Archive | ||
from ..constants import * # NOQA | ||
from ..helpers import bin_to_hex, Error | ||
from ..helpers import ProgressIndicatorPercent | ||
from ..manifest import Manifest | ||
from ..remote import RemoteRepository | ||
from ..repository import Repository | ||
|
||
from ..logger import create_logger | ||
|
||
logger = create_logger() | ||
|
||
|
||
class ArchiveAnalyzer:
    """Analyze a series of archives to find directories with a lot of chunk churn ("hot spots")."""

    def __init__(self, args, repository, manifest):
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        # directory path -> summed plaintext sizes of chunks added or removed under that path
        self.difference_by_path = defaultdict(int)

    def analyze(self):
        """Run the analysis over all selected archives and print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria."""
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            # we compare consecutive archives, so a single archive yields nothing
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        base = None  # chunks-by-path mapping of the previously analyzed archive
        for i, info in enumerate(archive_infos):
            pi.show(i)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
            new = self.analyze_archive(info.id)
            if base is not None:
                self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """Compute the chunks (chunk id -> plaintext size) per directory in the archive with this *id*."""
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(dict)  # directory path -> {chunk id: plaintext size}
        for item in archive.iter_items():
            if "chunks" in item:
                # aggregate the file's chunks under its direct parent directory
                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
                chunks_by_path[os.path.dirname(item.path)].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """For each directory path, sum up the changed (removed or added) chunks' sizes between *base* and *new*."""

        def analyze_path_change(path):
            # use .get() so we do not insert empty entries into the defaultdicts we were given
            base_chunks = base.get(path, {})
            new_chunks = new.get(path, {})
            # add up added chunks' sizes
            for chunk_id in new_chunks.keys() - base_chunks.keys():
                self.difference_by_path[path] += new_chunks[chunk_id]
            # add up removed chunks' sizes
            for chunk_id in base_chunks.keys() - new_chunks.keys():
                self.difference_by_path[path] += base_chunks[chunk_id]

        # consider every path present in either archive, each exactly once
        for path in base.keys() | new.keys():
            analyze_path_change(path)

    def report(self):
        """Print directories sorted by descending amount of change."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        for path, difference in sorted(self.difference_by_path.items(), key=lambda kv: kv[1], reverse=True):
            print(f"{path}: {difference}")
|
||
|
||
class AnalyzeMixIn:
    # Mixin that contributes the ``borg analyze`` subcommand (handler + argparse wiring)
    # to the archiver class it is mixed into.

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        # NOTE: the docstring above is user-visible — it is reused as the subcommand
        # description in build_parser_analyze(); do not edit it casually.
        ArchiveAnalyzer(args, repository, manifest).analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        # Register the ``analyze`` subcommand on *subparsers*.
        # *mid_common_parser* is part of the common build_parser_* signature but unused here.
        from ._common import process_epilog

        # long help text shown by ``borg analyze --help``; process_epilog normalizes the indentation
        analyze_epilog = process_epilog(
            """
        Analyze archives to find "hot spots".
        Borg analyze relies on the usual archive matching options to select the
        archives that should be considered for analysis (e.g. ``-a series_name``).
        Then it iterates over all matching archives, over all contained files and
        collects information about chunks stored in all directories it encountered.
        It considers chunk IDs and their plaintext sizes (we don't have the compressed
        size in the repository easily available) and adds up added/removed chunks'
        sizes per direct parent directory and outputs a list of "directory: size".
        You can use that list to find directories with a lot of "activity" - maybe
        some of these are temporary or cache directories you did forget to exclude.
        To not have these unwanted directories in your backups, you could carefully
        exclude these in ``borg create`` (for future backups) or use ``borg recreate``
        to re-create existing archives without these.
        """
        )
        subparser = subparsers.add_parser(
            "analyze",
            parents=[common_parser],
            add_help=False,
            description=self.do_analyze.__doc__,
            epilog=analyze_epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="analyze archives",
        )
        subparser.set_defaults(func=self.do_analyze)
        # archive selection options (-a/--match-archives, --first/--last, age filters, ...)
        define_archive_filters_group(subparser)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import pathlib | ||
|
||
from ...constants import * # NOQA | ||
from . import cmd, generate_archiver_tests, RK_ENCRYPTION | ||
|
||
def pytest_generate_tests(metafunc):
    """Parametrize the archiver tests in this module for the "local" repository kind."""
    # a proper def instead of a lambda assignment (PEP 8 / ruff E731)
    return generate_archiver_tests(metafunc, kinds="local")
|
||
|
||
def test_analyze(archivers, request):
    """End-to-end check that ``borg analyze`` sums changed chunks per directory across archives."""
    archiver = request.getfixturevalue(archivers)

    def snapshot():
        # take another archive of the input directory (same series name "archive")
        cmd(archiver, "create", "archive", archiver.input_path)

    def churn():
        # run the analyzer over the whole "archive" series and return its output
        return cmd(archiver, "analyze", "-a", "archive")

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    root = pathlib.Path(archiver.input_path)

    # 1st archive
    (root / "file1").write_text("1")
    snapshot()

    # 2nd archive
    (root / "file2").write_text("22")
    snapshot()

    assert "/input: 2" in churn()  # 2nd archive added 1 chunk for input path

    # 3rd archive
    (root / "file3").write_text("333")
    snapshot()

    assert "/input: 5" in churn()  # 2nd/3rd archives added 2 chunks for input path

    # 4th archive
    (root / "file2").unlink()
    snapshot()

    assert "/input: 7" in churn()  # 2nd/3rd archives added 2, 4th archive removed 1