From fea9eb938e2572ed6a13ce82abe2bfe0117d7492 Mon Sep 17 00:00:00 2001 From: Samir Reddigari Date: Thu, 8 Apr 2021 14:13:12 -0400 Subject: [PATCH 1/2] Add `pfdo`-style filtering --- bin/pfdicom_tagSub | 10 +++ pfdicom_tagSub/pfdicom_tagSub.py | 107 +++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/bin/pfdicom_tagSub b/bin/pfdicom_tagSub index bfd4908..a63ae0c 100755 --- a/bin/pfdicom_tagSub +++ b/bin/pfdicom_tagSub @@ -292,6 +292,14 @@ parser.add_argument("-e", "--extension", help = "DICOM file extension", dest = 'extension', default = '') +parser.add_argument("-f", "--fileFilter", + help = "a list of comma separated string filters to apply across the input file space", + dest = 'fileFilter', + default = '') +parser.add_argument("-d", "--dirFilter", + help = "a list of comma separated string filters to apply across the input dir space", + dest = 'dirFilter', + default = '') parser.add_argument("-F", "--tagFile", help = "JSON formatted file containing tags to sub", dest = 'tagFile', @@ -370,6 +378,8 @@ pf_dicom_tagSub = pfdicom_tagSub.pfdicom_tagSub( inputDir = args.inputDir, inputFile = args.inputFile, extension = args.extension, + fileFilter = args.fileFilter, + dirFilter = args.dirFilter, outputDir = args.outputDir, outputFileStem = args.outputFileStem, outputLeafDir = args.outputLeafDir, diff --git a/pfdicom_tagSub/pfdicom_tagSub.py b/pfdicom_tagSub/pfdicom_tagSub.py index 63d11c2..73d4592 100755 --- a/pfdicom_tagSub/pfdicom_tagSub.py +++ b/pfdicom_tagSub/pfdicom_tagSub.py @@ -50,6 +50,8 @@ def declare_selfvars(self): self.str_tagStruct = '' self.str_tagFile = '' self.d_tagStruct = {} + self.fileFilter = '' + self.dirFilter = '' self.dp = None self.log = None @@ -92,6 +94,8 @@ def outputFile_process(str_outputFile): if key == 'tagFile': tagFile_process(value) if key == 'tagStruct': tagStruct_process(value) if key == 'verbosity': self.verbosityLevel = int(value) + if key == 'fileFilter': self.fileFilter = value + if key == 
'dirFilter': self.dirFilter = value # Set logging self.dp = pfmisc.debug( @@ -273,6 +277,102 @@ def tags_substitute(self, **kwargs): ) return d_tagSub + def FS_filter(self, at_data, *args, **kwargs) -> dict: + """ + Apply a filter to the string space of file and directory + representations. + + The purpose of this method is to reduce the original space of + + "<path>": [<"filesToProcess">] + + to only those paths and files that are relevant to the operation being + performed. Two filters are understood, a `fileFilter` that filters + filenames that match any of the passed search substrings from the CLI + `--fileFilter`, and a `dirFilter` that filters directories whose + leaf node matches any of the passed `--dirFilter` substrings. + + The effect of these filters is hierarchical. First, the `fileFilter` + is applied across the space of files for a given directory path. The + files are subject to a logical OR operation across the comma separated + filter argument. Thus, a `fileFilter` of "png,jpg,body" will retain + only files that have the substrings of "png" OR "jpg" OR "body" in their + filenames. + + Next, if a `dirFilter` has been specified, the current string path + corresponding to the filenames being filtered is considered. Each + string in the comma separated `dirFilter` list is extracted, and if + the basename of the working directory contains the filter substring, + the (filtered) files are conserved. If the basename of the working + directory does not contain any of the `dirFilter` substrings, the + file list is discarded. + + Thus, a `dirFilter` of "100307,100556" and a fileFilter of "png,jpg" + will reduce the space of files to process to ONLY files that have + a parent directory of "100307" OR "100556" AND that contain either the + string "png" OR "jpg" in their file names.
+ """ + + b_status : bool = True + l_file : list = [] + l_dirHits : list = [] + l_dir : list = [] + str_path : str = at_data[0] + al_file : list = at_data[1] + + if len(self.fileFilter): + al_file = [x \ + for y in self.fileFilter.split(',') \ + for x in al_file if y in x] + + if len(self.dirFilter): + l_dirHits = [os.path.basename(str_path) \ + for y in self.dirFilter.split(',') \ + if y in os.path.basename(str_path)] + if len(l_dirHits): + # Remove any duplicates in the l_dirHits:. Duplicates can + # occur if the tokens in the filter expression map more than + # once into the leaf node in the , as a path that is + # + # /some/dir/in/the/space/1234567 + # + # and a search filter on the dirspace of "123,567" + [l_dir.append(x) for x in l_dirHits if x not in l_dir] + else: + # If no dir hits for this dir, then we zero out the + # file filter + al_file = [] + + if len(al_file): + al_file.sort() + l_file = al_file + b_status = True + else: + self.dp.qprint( "No valid files to analyze found in path %s!" % + str_path, comms = 'warn', level = 5) + l_file = None + b_status = False + return { + 'status': b_status, + 'l_file': l_file + } + + def filterFileHitList(self) -> dict: + """ + Entry point for filtering the file filter list + at each directory node. + """ + d_filterFileHitList = self.pf_tree.tree_process( + inputReadCallback = None, + analysisCallback = self.FS_filter, + outputWriteCallback = None, + applyResultsTo = 'inputTree', + applyKey = 'l_file', + persistAnalysisResults = True + ) + return d_filterFileHitList + + def run(self, *args, **kwargs): """ The run method calls the base class run() to @@ -285,6 +385,7 @@ def run(self, *args, **kwargs): b_status = True d_tagSub = {} b_timerStart = False + d_filter = {} self.dp.qprint( "Starting pfdicom_tagSub run... 
(please be patient while running)", @@ -306,7 +407,12 @@ def run(self, *args, **kwargs): timerStart = False ) + if d_pfdicom['status']: + if len(self.fileFilter) or len(self.dirFilter): + d_filter = self.filterFileHitList() + b_status = d_filter['status'] + str_startDir = os.getcwd() os.chdir(self.str_inputDir) if b_status: @@ -318,6 +424,7 @@ def run(self, *args, **kwargs): 'status': b_status, 'd_pfdicom': d_pfdicom, 'd_tagSub': d_tagSub, + 'd_filter': d_filter, 'runTime': other.toc() } From 0eb8f7be164f6f59db35b18ab94b25b887b8163e Mon Sep 17 00:00:00 2001 From: Samir Reddigari Date: Thu, 8 Apr 2021 14:37:52 -0400 Subject: [PATCH 2/2] Document filter arguments --- README.rst | 8 +++++++- bin/pfdicom_tagSub | 10 +++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 7ab657d..7047283 100644 --- a/README.rst +++ b/README.rst @@ -88,6 +88,12 @@ Command line arguments An optional extension to filter the DICOM files of interest from the . + [-f|--fileFilter ] + A list of comma separated string filters to apply across the input file space + + [-d|--dirFilter ] + A list of comma separated string filters to apply across the input dir space + [-O|--outputDir ] The output root directory that will contain a tree structure identical to the input directory, and each "leaf" node will contain the analysis @@ -155,7 +161,7 @@ Perform a DICOM anonymization by processing specific tags: ..
code:: bash pfdicom_tagSub \ - -e dcm \ + -f ".dcm" \ -I /var/www/html/normsmall \ -O /var/www/html/anon \ --tagStruct ' diff --git a/bin/pfdicom_tagSub b/bin/pfdicom_tagSub index a63ae0c..9c8de96 100755 --- a/bin/pfdicom_tagSub +++ b/bin/pfdicom_tagSub @@ -75,6 +75,8 @@ def synopsis(ab_shortOnly = False): -I|--inputDir \\ [-i|--inputFile ] \\ [-e|--extension ] \\ + [-f|--fileFilter ] \\ + [-d|--dirFilter ] \\ [-F|--tagFile ] | [-T|--tagStruct ] \\ [--threads ] \\ -O|--outputDir \\ @@ -86,7 +88,7 @@ def synopsis(ab_shortOnly = False): BRIEF EXAMPLE pfdicom_tagSub \\ - -e dcm \\ + -f ".dcm" \\ -I /var/www/html/normsmall \\ -O /var/www/html/anon \\ --tagStruct ' @@ -162,6 +164,12 @@ def synopsis(ab_shortOnly = False): An optional extension to filter the DICOM files of interest from the . + [-f|--fileFilter ] + A list of comma separated string filters to apply across the input file space + + [-d|--dirFilter ] + A list of comma separated string filters to apply across the input dir space + [-O|--outputDir ] The output root directory that will contain a tree structure identical to the input directory, and each "leaf" node will contain the analysis