%%bash
# Print the macOS software overview report (SPSoftwareDataType) for this machine.
system_profiler SPSoftwareDataType
Software: System Software Overview: System Version: Mac OS X 10.7.5 (11G63) Kernel Version: Darwin 11.4.2 Boot Volume: SSD2 Boot Mode: Normal Computer Name: greenbird (2) User Name: Sam (Sam) Secure Virtual Memory: Enabled 64-bit Kernel and Extensions: No Time since boot: 28 days 18:18
%%bash
# Preview the first 5 lines of the fslint duplicate-file report for Eagle/archive.
# Uses the POSIX "-n 5" form instead of the obsolescent "-5" shorthand.
head -n 5 /Volumes/web/Arabidopsis/20151229_duplicate_files_eagle_archive.txt
#2 x 2,765,343,018 (2,765,343,232) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/NGS Raw Data/Burge_Laby/10233509_709JBAAXX_s_6_sequence.gz /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/NGS Raw Data/Burge_Laby/10233509_709JBAAXX_s_6_sequence2.gz #2 x 1,337,988,079 (1,337,988,096) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/NGS Raw Data/Friedman_Oly_broodstock/106A Female Mix/filtered_106A_Female_Mix_GATCAG_L007_R1.fastq.gz
%%bash
# Total the "bytes wasted" figures from the output of the Linux program fslint.
#
# Each duplicate group in the report starts with a header line such as:
#   #2 x 2,765,343,018 (2,765,343,232) bytes wasted
# Whitespace-separated field 4 (the parenthesised number) is the wasted-bytes
# figure for that group; the parentheses and commas are stripped so the shell
# can treat it as an integer.
#
# Arguments:
#   $1 - path to the fslint report
#   $2 - minimum wasted bytes a group needs in order to be counted.
#        Default 1000000 bytes (1 MB). Note this is compared against the
#        group's wasted-bytes figure, not against individual file size.
# Outputs:
#   "<total> bytes wasted" and "<total / 10^9> gigabytes wasted" on stdout
#   (the gigabyte figure is truncated by integer division).
# Returns:
#   1 if the report cannot be read, so a missing file is not reported as
#   "0 bytes wasted".
sum_wasted_bytes() {
  local report=$1
  local min_bytes=${2:-1000000}
  local total=0
  local marker wasted

  if [[ ! -r "$report" ]]; then
    printf 'Cannot read fslint report: %s\n' "$report" >&2
    return 1
  fi

  # The "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while read -r marker _ _ wasted _ || [[ -n "$marker" ]]; do
    # Only group header lines (those beginning with '#') carry a total.
    [[ "$marker" == '#'* ]] || continue

    # "(2,765,343,232)" -> "2765343232"
    wasted=${wasted//[(),]/}

    # Skip headers whose fourth field is missing or not a plain number.
    [[ "$wasted" =~ ^[0-9]+$ ]] || continue

    # 10# forces base 10 so a leading zero is never read as octal.
    if (( 10#$wasted >= min_bytes )); then
      total=$(( total + 10#$wasted ))
    fi
  done < "$report"

  printf '%s bytes wasted\n' "$total"
  printf '%s gigabytes wasted\n' "$(( total / 1000000000 ))"
}

sum_wasted_bytes /Volumes/web/Arabidopsis/20151229_duplicate_files_eagle_web.txt
713041788416 bytes wasted 713 gigabytes wasted
%%bash
# Total the "bytes wasted" figures from the output of the Linux program fslint.
#
# Each duplicate group in the report starts with a header line such as:
#   #2 x 2,765,343,018 (2,765,343,232) bytes wasted
# Whitespace-separated field 4 (the parenthesised number) is the wasted-bytes
# figure for that group; the parentheses and commas are stripped so the shell
# can treat it as an integer.
#
# Arguments:
#   $1 - path to the fslint report
#   $2 - minimum wasted bytes a group needs in order to be counted.
#        Default 1000000 bytes (1 MB). Note this is compared against the
#        group's wasted-bytes figure, not against individual file size.
# Outputs:
#   "<total> bytes wasted" and "<total / 10^9> gigabytes wasted" on stdout
#   (the gigabyte figure is truncated by integer division).
# Returns:
#   1 if the report cannot be read, so a missing file is not reported as
#   "0 bytes wasted".
sum_wasted_bytes() {
  local report=$1
  local min_bytes=${2:-1000000}
  local total=0
  local marker wasted

  if [[ ! -r "$report" ]]; then
    printf 'Cannot read fslint report: %s\n' "$report" >&2
    return 1
  fi

  # The "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while read -r marker _ _ wasted _ || [[ -n "$marker" ]]; do
    # Only group header lines (those beginning with '#') carry a total.
    [[ "$marker" == '#'* ]] || continue

    # "(2,765,343,232)" -> "2765343232"
    wasted=${wasted//[(),]/}

    # Skip headers whose fourth field is missing or not a plain number.
    [[ "$wasted" =~ ^[0-9]+$ ]] || continue

    # 10# forces base 10 so a leading zero is never read as octal.
    if (( 10#$wasted >= min_bytes )); then
      total=$(( total + 10#$wasted ))
    fi
  done < "$report"

  printf '%s bytes wasted\n' "$total"
  printf '%s gigabytes wasted\n' "$(( total / 1000000000 ))"
}

sum_wasted_bytes /Volumes/web/Arabidopsis/20151229_duplicate_files_eagle_archive.txt
19525825024 bytes wasted 19 gigabytes wasted
Below is how the file looks.
It contains ugly remote paths because the fslint program was run remotely from a Linux machine.
%%bash
# Show the first 10 lines (head's default count) of the raw Eagle/archive fslint report.
head -n 10 /Volumes/web/Arabidopsis/20151229_duplicate_files_eagle_archive.txt
#2 x 2,765,343,018 (2,765,343,232) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/NGS Raw Data/Burge_Laby/10233509_709JBAAXX_s_6_sequence.gz /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/NGS Raw Data/Burge_Laby/10233509_709JBAAXX_s_6_sequence2.gz #2 x 1,337,988,079 (1,337,988,096) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/NGS Raw Data/Friedman_Oly_broodstock/106A Female Mix/filtered_106A_Female_Mix_GATCAG_L007_R1.fastq.gz /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/armina/filtered_106A_Female_Mix_GATCAG_L007_R1.fastq.gz #2 x 1,034,159,435 (1,034,159,616) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/filefish/MBD_meth_refmap_v030.sam /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=archive/site_sucker/aquacul4.fish.washington.edu/~steven/filefish/MBD_meth_refmap_v030.sam #2 x 1,011,649,653 (1,011,650,048) bytes wasted
%%bash
# Remove the GVFS SMB mount prefix from the fslint output file for Eagle/archive,
# leaving paths that start at the share name (e.g. "archive/...").
#
# strip_gvfs_prefix filters stdin to stdout. The sed expression uses '|' as the
# delimiter so the slashes in the path need no escaping, and the dots in the
# host name are escaped so they match only a literal '.'.
strip_gvfs_prefix() {
  sed 's|/run/user/1000/gvfs/smb-share:server=eagle\.fish\.washington\.edu,share=||g'
}

# The input redirect comes first: if the source file is missing, the shell stops
# before opening (and truncating) the cleaned output file.
strip_gvfs_prefix \
  < /Volumes/web/Arabidopsis/20151229_duplicate_files_eagle_archive.txt \
  > /Volumes/web/Arabidopsis/20151229_duplicate_files_eagle_archive_cleaned.txt
%%bash
# Check the result of the sed clean-up: show the first 10 lines of the cleaned file.
head -n 10 /Volumes/web/Arabidopsis/20151229_duplicate_files_eagle_archive_cleaned.txt
#2 x 2,765,343,018 (2,765,343,232) bytes wasted archive/NGS Raw Data/Burge_Laby/10233509_709JBAAXX_s_6_sequence.gz archive/NGS Raw Data/Burge_Laby/10233509_709JBAAXX_s_6_sequence2.gz #2 x 1,337,988,079 (1,337,988,096) bytes wasted archive/NGS Raw Data/Friedman_Oly_broodstock/106A Female Mix/filtered_106A_Female_Mix_GATCAG_L007_R1.fastq.gz archive/armina/filtered_106A_Female_Mix_GATCAG_L007_R1.fastq.gz #2 x 1,034,159,435 (1,034,159,616) bytes wasted archive/filefish/MBD_meth_refmap_v030.sam archive/site_sucker/aquacul4.fish.washington.edu/~steven/filefish/MBD_meth_refmap_v030.sam #2 x 1,011,649,653 (1,011,650,048) bytes wasted
%%bash
# Show the first 10 lines (head's default count) of the raw Eagle/web fslint report.
head -n 10 /Volumes/web/Arabidopsis/20160104_duplicate_files_eagle_web.txt
#2 x 116,741,159,757 (116,741,159,936) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=web/cnidarian/Geo-Trinity2/trinity_out_dir/bowtie.nameSorted.sam /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=web/cnidarian/Geo-trinity/trinity_out_dir/bowtie.nameSorted.sam #2 x 57,464,638,599 (57,464,638,976) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=web/cnidarian/Geo-Trinity2/trinity_out_dir/both.fa /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=web/cnidarian/Geo-trinity/trinity_out_dir/both.fa #2 x 16,807,324,618 (16,807,324,672) bytes wasted /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=web/Ichthyophonus/ICH_SNP/iplant_vcf_to_gff.gff /run/user/1000/gvfs/smb-share:server=eagle.fish.washington.edu,share=web/whale/fish546/module8/ICH_SNP/iplant_vcf_to_gff.gff #7 x 2,747,235,434 (16,483,415,040) bytes wasted
%%bash
# Remove the GVFS SMB mount prefix from the fslint output file for Eagle/web,
# leaving paths that start at the share name (e.g. "web/...").
#
# strip_gvfs_prefix filters stdin to stdout. The sed expression uses '|' as the
# delimiter so the slashes in the path need no escaping, and the dots in the
# host name are escaped so they match only a literal '.'.
strip_gvfs_prefix() {
  sed 's|/run/user/1000/gvfs/smb-share:server=eagle\.fish\.washington\.edu,share=||g'
}

# The input redirect comes first: if the source file is missing, the shell stops
# before opening (and truncating) the cleaned output file.
strip_gvfs_prefix \
  < /Volumes/web/Arabidopsis/20160104_duplicate_files_eagle_web.txt \
  > /Volumes/web/Arabidopsis/20160104_duplicate_files_eagle_web_cleaned.txt
%%bash
# Show the first 10 lines (head's default count) of the cleaned Eagle/web report.
head -n 10 /Volumes/web/Arabidopsis/20160104_duplicate_files_eagle_web_cleaned.txt
#2 x 116,741,159,757 (116,741,159,936) bytes wasted web/cnidarian/Geo-Trinity2/trinity_out_dir/bowtie.nameSorted.sam web/cnidarian/Geo-trinity/trinity_out_dir/bowtie.nameSorted.sam #2 x 57,464,638,599 (57,464,638,976) bytes wasted web/cnidarian/Geo-Trinity2/trinity_out_dir/both.fa web/cnidarian/Geo-trinity/trinity_out_dir/both.fa #2 x 16,807,324,618 (16,807,324,672) bytes wasted web/Ichthyophonus/ICH_SNP/iplant_vcf_to_gff.gff web/whale/fish546/module8/ICH_SNP/iplant_vcf_to_gff.gff #7 x 2,747,235,434 (16,483,415,040) bytes wasted