Skip to content

Commit

Permalink
Add change dbs dataset file status fix 5204 (#5243)
Browse files Browse the repository at this point in the history
* add setdataset.py for #5204

* refactor and add setfiles

* add Content-type arg to HTTPRequests

* setdataset to use contentType

* rename commands to setdatasetstatus setfilestatus

* add autocomplete

* list of LFNs not supported yet

* some pylint and pep8

* add logging for setfilestatus
  • Loading branch information
belforte authored Oct 26, 2023
1 parent 0bc1ad0 commit 330747d
Show file tree
Hide file tree
Showing 5 changed files with 362 additions and 9 deletions.
26 changes: 23 additions & 3 deletions etc/crab-bash-completion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ _UseCrab ()
"")
case "$cur" in
"")
COMPREPLY=( $(compgen -W '--version --help -h --quiet --debug status tasks proceed checkwrite getlog checkusername checkdataset submit getoutput resubmit kill uploadlog remake report preparelocal createmyproxy' -- $cur) )
COMPREPLY=( $(compgen -W '--version --help -h --quiet --debug status tasks proceed checkwrite getlog checkusername checkdataset submit getoutput resubmit kill uploadlog remake report preparelocal createmyproxy setdatasetstatus setfilestatus' -- $cur) )
;;
-*)
COMPREPLY=( $(compgen -W '--version --help -h --quiet --debug' -- $cur) )
;;
*)
COMPREPLY=( $(compgen -W 'status tasks proceed checkwrite getlog checkusername checkdataset submit getoutput resubmit kill uploadlog remake report preparelocal createmyproxy' -- $cur) )
COMPREPLY=( $(compgen -W 'status tasks proceed checkwrite getlog checkusername checkdataset submit getoutput resubmit kill uploadlog remake report preparelocal createmyproxy setdatasetstatus setfilestatus' -- $cur) )
;;
esac
;;
Expand Down Expand Up @@ -284,9 +284,29 @@ _UseCrab ()
esac
;;

"setdatasetstatus")
case "$cur" in
-*)
COMPREPLY=( $(compgen -W '--help -h --status --dataset' -- $cur) )
;;
*)
COMPREPLY=( $(compgen -f $cur) )
esac
;;

"setfilestatus")
case "$cur" in
-*)
COMPREPLY=( $(compgen -W '--help -h --status --dataset --files' -- $cur) )
;;
*)
COMPREPLY=( $(compgen -f $cur) )
esac
;;


*)
COMPREPLY=( $(compgen -W 'status tasks proceed checkwrite getlog checkusername submit getoutput resubmit kill uploadlog remake report preparelocal' -- $cur) )
COMPREPLY=( $(compgen -W 'status tasks proceed checkwrite getlog checkusername submit getoutput resubmit kill uploadlog remake report preparelocal setdatasetstatus setfilestatus' -- $cur) )
;;
esac

Expand Down
2 changes: 2 additions & 0 deletions src/python/CRABClient/ClientMapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@
'checkusername' : {'acceptsArguments': False, 'requiresREST': False, 'requiresRucio': False, 'requiresDirOption': False, 'useCache': False, 'requiresProxyVOOptions': False, 'requiresLocalCache': False},
'checkwrite' : {'acceptsArguments': False, 'requiresREST': False, 'requiresRucio': True, 'requiresDirOption': False, 'useCache': False, 'requiresProxyVOOptions': True, 'requiresLocalCache': False},
'checkdataset' : {'acceptsArguments': False, 'requiresREST': False, 'requiresRucio': True, 'requiresDirOption': False, 'useCache': False, 'requiresProxyVOOptions': False, 'requiresLocalCache': False},
'setdatasetstatus' : {'acceptsArguments': False, 'requiresREST': False, 'requiresRucio': False, 'requiresDirOption': False, 'useCache': False, 'requiresProxyVOOptions': False, 'requiresLocalCache': False},
'setfilestatus' : {'acceptsArguments': False, 'requiresREST': False, 'requiresRucio': False, 'requiresDirOption': False, 'useCache': False, 'requiresProxyVOOptions': False, 'requiresLocalCache': False},
'getlog' : {'acceptsArguments': False, 'requiresREST': True, 'requiresRucio': False, 'requiresDirOption': True, 'useCache': True, 'requiresProxyVOOptions': True, 'requiresLocalCache': True },
'getoutput' : {'acceptsArguments': False, 'requiresREST': True, 'requiresRucio': True, 'requiresDirOption': True, 'useCache': True, 'requiresProxyVOOptions': True, 'requiresLocalCache': True },
'kill' : {'acceptsArguments': False, 'requiresREST': True, 'requiresRucio': False, 'requiresDirOption': True, 'useCache': False, 'requiresProxyVOOptions': False, 'requiresLocalCache': True },
Expand Down
167 changes: 167 additions & 0 deletions src/python/CRABClient/Commands/setdatasetstatus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# pylint: disable=consider-using-f-string, unspecified-encoding
"""
allow users to (in)validate their own DBS USER datasets
"""

import sys
import json

from CRABClient.Commands.SubCommand import SubCommand
from CRABClient.ClientExceptions import MissingOptionException, ConfigurationException, CommandFailedException
from CRABClient.ClientUtilities import colors
from CRABClient.CrabRestInterface import HTTPRequests

# urlencode lives in a different module in python2 and python3; pick the
# one matching the running interpreter
if sys.version_info >= (3, 0):
    from urllib.parse import urlencode  # pylint: disable=E0611
else:
    from urllib import urlencode

# fall back to a dummy version string when the CRABClient package
# does not provide one
try:
    from CRABClient import __version__
except:  # pylint: disable=bare-except
    __version__ = '0.0.0'


def getDbsREST(instance=None, logger=None, cert=None, key=None, version=None):
    """
    Given a DBS instance (e.g. prod/phys03) return a (dbsReader, dbsWriter)
    pair of HTTPRequests clients, one pointing at the DBSReader endpoint and
    one at the DBSWriter endpoint of that instance.

    Arguments:
      instance: either the short "server/db" form (e.g. prod/phys03 or
                int/phys03) or a full https:// URL which contains either
                DBSReader or DBSWriter
      logger: a logger (used here for debug messages and passed to the clients)
      cert, key : name of files, can use the path to X509_USER_PROXY for both
      version: the CRAB Client version to put in the User Agent field of the query
    Returns:
      the tuple (dbsReader, dbsWriter)
    Raises:
      ConfigurationException if instance is empty, or if it is a full URL
      which names neither DBSReader nor DBSWriter
    """
    if not instance:
        raise ConfigurationException("bad instance value %s" % instance)

    httpsPrefix = "https://"
    if instance.startswith(httpsPrefix):
        # user knows better and provided a full URL, we take it and adapt
        # to have both Reader and Writer.
        # Remove the scheme by slicing: str.lstrip() strips a *set of characters*
        # and would also eat leading h/t/p/s/:// characters of the host name.
        # The scheme will be added back in HTTPRequests
        url = instance[len(httpsPrefix):]
        if "DBSReader" in url:
            dbsReadUrl = url
            dbsWriteUrl = url.replace('DBSReader', 'DBSWriter')
        elif 'DBSWriter' in url:
            dbsWriteUrl = url
            dbsReadUrl = url.replace('DBSWriter', 'DBSReader')
        else:
            raise ConfigurationException("bad instance value %s" % instance)
    else:
        # user supplied a simple prod/phys03 like instance
        # note that our HTTPRequests will add https://
        dbsReadUrl = "cmsweb.cern.ch:8443/dbs/" + instance + "/DBSReader/"
        dbsWriteUrl = "cmsweb.cern.ch:8443/dbs/" + instance + "/DBSWriter/"
        # a possible use case e.g. for testing is to use int instance of DBS.
        # requires testbed CMSWEB
        if instance.startswith('int'):
            dbsReadUrl = dbsReadUrl.replace('cmsweb', 'cmsweb-testbed')
            dbsWriteUrl = dbsWriteUrl.replace('cmsweb', 'cmsweb-testbed')

    logger.debug('Read Url = %s', dbsReadUrl)
    logger.debug('Write Url = %s', dbsWriteUrl)

    dbsReader = HTTPRequests(hostname=dbsReadUrl, localcert=cert, localkey=key,
                             retry=2, logger=logger, verbose=False, contentType='application/json',
                             userAgent='CRABClient', version=version)

    dbsWriter = HTTPRequests(hostname=dbsWriteUrl, localcert=cert, localkey=key,
                             retry=2, logger=logger, verbose=False, contentType='application/json',
                             userAgent='CRABClient', version=version)
    return dbsReader, dbsWriter


class setdatasetstatus(SubCommand):
    """
    Set status of a USER dataset in phys03,
    optionally invalidates/revalidates all files in it (not implemented yet,
    see the --recursive option)
    meant to replace https://github.com/dmwm/DBS/blob/master/Client/utils/DataOpsScripts/DBS3SetDatasetStatus.py
    and to work whenever CRAB is supported, i.e. with both python2 and python3
    """

    name = 'setdatasetstatus'

    def __init__(self, logger, cmdargs=None):
        SubCommand.__init__(self, logger, cmdargs)

    def __call__(self):
        """
        Look up the dataset in DBS, set its dataset_access_type to the requested
        status and log the status read back from DBS afterwards.

        Returns:
          {'commandStatus': 'SUCCESS'} when DBS accepted the change
        Raises:
          ConfigurationException if the dataset is not found in DBS
          CommandFailedException if DBS refuses the status change
        """
        result = 'FAILED'  # will change to 'SUCCESS' when all is OK

        instance = self.options.instance
        dataset = self.options.dataset
        status = self.options.status
        recursive = self.options.recursive
        self.logger.debug('instance = %s', instance)
        self.logger.debug('dataset = %s', dataset)
        self.logger.debug('status = %s', status)
        self.logger.debug('recursive = %s', recursive)

        if recursive:
            self.logger.warning("ATTENTION: recursive option is not implemented yet. Ignoring it")

        # from DBS instance, to DBS REST services
        dbsReader, dbsWriter = getDbsREST(instance=instance, logger=self.logger,
                                          cert=self.proxyfilename, key=self.proxyfilename,
                                          version=__version__)

        self.logger.info("looking up Dataset %s in DBS %s" % (dataset, instance))
        datasetStatusQuery = {'dataset': dataset, 'dataset_access_type': '*', 'detail': True}
        ds, rc, msg = dbsReader.get(uri="datasets", data=urlencode(datasetStatusQuery))
        self.logger.debug('exitcode= %s', rc)
        if not ds:
            # carry the explanation in the exception, like the other raises in this command
            raise ConfigurationException("ERROR: dataset %s not found in DBS" % dataset)
        self.logger.info("Dataset status in DBS is %s" % ds[0]['dataset_access_type'])
        self.logger.info("Will set it to %s" % status)
        jdata = json.dumps({'dataset': dataset, 'dataset_access_type': status})
        out, rc, msg = dbsWriter.put(uri='datasets', data=jdata)
        if rc == 200 and msg == 'OK':
            self.logger.info("Dataset status changed successfully")
            result = 'SUCCESS'
        else:
            msg = "Dataset status change failed: %s" % out
            raise CommandFailedException(msg)

        # read the status back, for the user to see. The change was already
        # accepted, so an empty reply here must not turn success into a crash
        ds, rc, msg = dbsReader.get(uri="datasets", data=urlencode(datasetStatusQuery))
        self.logger.debug('exitcode= %s', rc)
        if ds:
            self.logger.info("Dataset status in DBS now is %s" % ds[0]['dataset_access_type'])
        else:
            self.logger.warning("Could not read back dataset status from DBS")

        return {'commandStatus': result}

    def setOptions(self):
        """
        __setOptions__
        Define the command line options specific to this command:
        --instance, --dataset, --status and --recursive
        """
        self.parser.add_option('--instance', dest='instance', default='prod/phys03',
                               help="DBS instance. e.g. prod/phys03 (default) or int/phys03. Use at your own risk. "
                                    "Unless you really know what you are doing, stay with the default"
                               )
        self.parser.add_option('--dataset', dest='dataset', default=None,
                               help='dataset name')
        self.parser.add_option('--status', dest='status', default=None,
                               help="New status of the dataset: VALID/INVALID/DELETED/DEPRECATED",
                               choices=['VALID', 'INVALID', 'DELETED', 'DEPRECATED']
                               )
        self.parser.add_option('--recursive', dest='recursive', default=False, action="store_true",
                               help="Apply status to children datasets and sets all files status in those "
                                    "to VALID if status=VALID, INVALID otherwise"
                               )

    def validateOptions(self):
        """
        Check that --dataset and --status were given and that --instance
        looks like either server/db or a full https:// URL.

        Raises:
          MissingOptionException if --dataset or --status is missing
          ConfigurationException if the --instance value is malformed
        """
        SubCommand.validateOptions(self)

        if self.options.dataset is None:
            msg = "%sError%s: Please specify the dataset to check." % (colors.RED, colors.NORMAL)
            msg += " Use the --dataset option."
            ex = MissingOptionException(msg)
            ex.missingOption = "dataset"
            raise ex
        if self.options.status is None:
            msg = "%sError%s: Please specify the new dataset status." % (colors.RED, colors.NORMAL)
            msg += " Use the --status option."
            ex = MissingOptionException(msg)
            ex.missingOption = "status"
            raise ex
        # minimal sanity check: the short form has exactly one '/',
        # more than that is only acceptable in a full URL
        instance = self.options.instance
        isFullUrl = instance.startswith('https://')
        tooManyParts = len(instance.split('/')) > 2
        if '/' not in instance or (tooManyParts and not isFullUrl):
            msg = "Bad instance value %s. " % instance
            msg += "Use either server/db format or full URL"
            raise ConfigurationException(msg)
156 changes: 156 additions & 0 deletions src/python/CRABClient/Commands/setfilestatus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# pylint: disable=consider-using-f-string, unspecified-encoding
"""
allow users to (in)validate some files in their USER datasets in phys03
"""

import json

from CRABClient.Commands.SubCommand import SubCommand
from CRABClient.ClientExceptions import MissingOptionException, ConfigurationException, CommandFailedException
from CRABClient.ClientUtilities import colors
from CRABClient.Commands.setdatasetstatus import getDbsREST

try:
from CRABClient import __version__
except: # pylint: disable=bare-except
__version__ = '0.0.0'


class setfilestatus(SubCommand):
    """
    Set status (VALID/INVALID) of files in a USER dataset in phys03.
    Files can be indicated either via --files (a single LFN, or the name of
    a local file containing it) or via --dataset (all files in that dataset).
    Meant to work whenever CRAB is supported, i.e. with both python2 and python3
    """

    name = 'setfilestatus'

    def __init__(self, logger, cmdargs=None):
        SubCommand.__init__(self, logger, cmdargs)

    def __call__(self):
        """
        Change the is_file_valid flag in DBS for the requested file or for all
        files of the requested dataset, logging the dataset's
        total/valid/invalid file count before and after the change.

        Returns:
          {'commandStatus': 'SUCCESS'} when DBS accepted the change
        Raises:
          NotImplementedError if more than one LFN is given
          ConfigurationException if no LFN is found in --files or the LFN is not in DBS
          CommandFailedException if DBS refuses the status change
        """
        result = 'FAILED'  # will change to 'SUCCESS' when all is OK

        # initialize, and validate args
        instance = self.options.instance
        dataset = self.options.dataset
        files = self.options.files
        status = self.options.status
        self.logger.debug('instance = %s', instance)
        self.logger.debug('dataset = %s', dataset)
        self.logger.debug('files = %s', files)
        self.logger.debug('status = %s', status)

        statusToSet = 1 if status == 'VALID' else 0

        filesToChange = None
        if files:
            # files and dataset options are mutually exclusive
            dataset = None
            # did the user specify the name of a file containing a list of LFN's ?
            try:
                with open(files, 'r') as f:
                    # ignore blank lines, they would result in empty LFN's
                    flist = [line.strip() for line in f if line.strip()]
                filesToChange = ','.join(flist)
            except IOError:
                # no. Assume we have a comma separated list of LFN's (a single LFN is also OK)
                filesToChange = files.strip(",").strip()
            # these checks are done after the try block, not in a finally clause,
            # so that an unexpected error while reading the file is not masked
            if not filesToChange:
                raise ConfigurationException("ERROR: no LFN found in %s" % files)
            if ',' in filesToChange:
                raise NotImplementedError('list of LFNs is not supported yet')

        # from DBS instance, to DBS REST services
        dbsReader, dbsWriter = getDbsREST(instance=instance, logger=self.logger,
                                          cert=self.proxyfilename, key=self.proxyfilename,
                                          version=__version__)
        # we will need the dataset name
        if dataset:
            datasetName = dataset
        else:
            # get it from DBS
            lfn = filesToChange.split(',')[0]
            query = {'logical_file_name': lfn}
            out, _, _ = dbsReader.get(uri='datasets', data=query)
            if not out:
                # carry the explanation in the exception, like the other raises in this command
                raise ConfigurationException("ERROR: file %s not found in DBS" % lfn)
            datasetName = out[0]['dataset']
            self.logger.info('LFN to be changed belongs to dataset %s' % datasetName)

        # when acting on a list of LFN's, can't print status of all files before/after
        # best we can do is to print the number of valid/invalid file in the dataset
        # before/after.

        self.logFilesTally(dataset=datasetName, dbs=dbsReader)

        # validateOptions guarantees that exactly one of --dataset, --files was given
        if dataset:
            data = {'dataset': dataset, 'is_file_valid': statusToSet}
        else:
            data = {'logical_file_name': filesToChange, 'is_file_valid': statusToSet}
        jdata = json.dumps(data)  # PUT requires data in JSON format
        out, rc, msg = dbsWriter.put(uri='files', data=jdata)
        if rc == 200 and msg == 'OK':
            self.logger.info("File(s) status changed successfully")
            result = 'SUCCESS'
        else:
            msg = "File(s) status change failed: %s" % out
            raise CommandFailedException(msg)

        self.logFilesTally(dataset=datasetName, dbs=dbsReader)

        return {'commandStatus': result}

    def logFilesTally(self, dataset=None, dbs=None):
        """
        prints total/valid/invalid files in dataset
        Arguments:
          dataset: the dataset name
          dbs: the client to use for querying the DBS 'files' API
        """
        query = {'dataset': dataset, 'validFileOnly': 1}
        out, _, _ = dbs.get(uri='files', data=query)
        valid = len(out)
        query = {'dataset': dataset, 'validFileOnly': 0}
        out, _, _ = dbs.get(uri='files', data=query)
        total = len(out)
        invalid = total - valid
        self.logger.info("Dataset file count total/valid/invalid = %d/%d/%d" % (total, valid, invalid))

    def setOptions(self):
        """
        __setOptions__
        Define the command line options specific to this command:
        --instance, --dataset, --status and --files
        """
        self.parser.add_option('-i', '--instance', dest='instance', default='prod/phys03',
                               help='DBS instance. e.g. prod/phys03 (default) or int/phys03'
                               )
        self.parser.add_option('-d', '--dataset', dest='dataset', default=None,
                               help='Will apply status to all files in this dataset.'
                                    ' Use either --files or --dataset',
                               metavar='<dataset_name>')
        self.parser.add_option('-s', '--status', dest='status', default=None,
                               help='New status of the file(s): VALID/INVALID',
                               choices=['VALID', 'INVALID']
                               )
        self.parser.add_option('-f', '--files', dest='files', default=None,
                               help='List of files to be validated/invalidated.'
                                    ' Can be either a simple LFN or a file containing LFNs or'
                                    ' a comma separated list of LFNs. Use either --files or --dataset',
                               metavar="<lfn1[,..,lfnx] or filename>")

    def validateOptions(self):
        """
        Check that exactly one of --files, --dataset was given and that
        --status was given.

        Raises:
          MissingOptionException if --status or both --files and --dataset are missing
          ConfigurationException if both --files and --dataset are given
        """
        SubCommand.validateOptions(self)

        if not self.options.files and not self.options.dataset:
            msg = "%sError%s: Please specify the files to change." % (colors.RED, colors.NORMAL)
            msg += " Use either the --files or the --dataset option."
            ex = MissingOptionException(msg)
            ex.missingOption = "files"
            raise ex
        if self.options.files and self.options.dataset:
            msg = "%sError%s: You can not use both --files and --dataset at same time" % (colors.RED, colors.NORMAL)
            raise ConfigurationException(msg)
        if self.options.status is None:
            msg = "%sError%s: Please specify the new file(s) status." % (colors.RED, colors.NORMAL)
            msg += " Use the --status option."
            ex = MissingOptionException(msg)
            ex.missingOption = "status"
            raise ex
Loading

0 comments on commit 330747d

Please sign in to comment.