# 03: Solutions to Useful standard library modules exercises

In [1]:
import os
from pathlib import Path
import shutil
import subprocess
import sys
import zipfile

## Exercise: Make a script with a command line argument using sys.argv

1) Using a text editor such as VSCode, make a new ``*.py`` file with the following contents:

```python
import sys

if len(sys.argv) > 1:
    for argument in sys.argv[1:]:
        print(argument)
else:
    print("usage is: python <script name>.py argument")
    quit()
```

2) Try running the script at the command line

In [2]:
# Source of the example script, kept identical to the listing in the exercise
# text above (4-space indents, with quit() inside the else branch).
write_text = (
    'import sys\n\n'
    'if len(sys.argv) > 1:\n'
    '    for argument in sys.argv[1:]:\n'
    '        print(argument)\n'
    'else:\n'
    '    print("usage is: python <script name>.py argument")\n'
    '    quit()\n'
)

# Write the script to the current working directory so that the following
# cells can run it by its relative name.
with open('myscript.py', 'w') as dest:
    dest.write(write_text)

In [3]:
# Run the script with the same interpreter as this kernel (sys.executable)
# rather than whatever "python" happens to be first on the PATH.
# capture_output/text are needed for result.stdout to hold the script's
# output as a string; without them result.stdout is None.
result = subprocess.run(
    [sys.executable, 'myscript.py'],
    check=True, capture_output=True, text=True,
)
print(result.stdout)

usage is: python <script name>.py argument


In [4]:
# Same as before, but now passing two command line arguments to the script.
# sys.executable guarantees the kernel's own interpreter is used, and
# capture_output/text make result.stdout a string instead of None.
result = subprocess.run(
    [sys.executable, 'myscript.py', 'arg1', 'arg2'],
    check=True, capture_output=True, text=True,
)
print(result.stdout)

arg1
arg2


## Testing Your Skills with a truly awful example:

### the problem:
Pretend that the file `data/netcdf_data.zip` contains some climate data (in the NetCDF format with the ``*.nc`` extension) that we downloaded. If you open `data/netcdf_data.zip`, you'll see that within a subfolder `zipped` are a bunch of additional subfolders, each for a different year. Within each subfolder is another zipfile. Within each of these zipfiles is yet another subfolder, inside of which is the actual data file we want (`prcp.nc`).

In [5]:
# Peek at the first ten entries of the outer archive to see how it is laid out
with zipfile.ZipFile('../data/netcdf_data.zip') as archive:
    first_entries = archive.namelist()[:10]

for entry in first_entries:
    print(entry)

netcdf_data/
netcdf_data/zipped/
netcdf_data/zipped/zipped_1991/
netcdf_data/zipped/zipped_1991/12270_1991.zip
netcdf_data/zipped/zipped_1996/
netcdf_data/zipped/zipped_1996/12270_1996.zip
netcdf_data/zipped/zipped_1998/
netcdf_data/zipped/zipped_1998/12270_1998.zip
netcdf_data/zipped/zipped_1999/
netcdf_data/zipped/zipped_1999/12270_1999.zip


### the goal:
To extract all of these `prcp.nc` files into a single folder, renaming each one with its respective year (obtained from the name of its enclosing folder or zip file). For example:  
```
prcp_1980.nc
prcp_1981.nc
...
```
This will allow us to open them together as a dataset in `xarray` (more on that later). Does this sound awful? I'm not making this up. This is the kind of structure you get when downloading tiles of climate data with the [Daymet Tile Selection Tool](https://daymet.ornl.gov/gridded/)

### hint:
you might find these functions helpful:
```
ZipFile.extractall
ZipFile.extract
Path.glob
Path.mkdir
Path.stem
Path.parent
Path.name
shutil.move
Path.rmdir()


os.path.isdir
os.makedirs

os.path.split
os.path.splitext
os.path.join
os.rename
os.rmdir
```

### hint: start by using ``ZipFile.extractall()`` to extract all of the individual zip files from the main zip archive
This extracts the entire contents of the zip file to a designated folder

In [6]:
# Folder that receives everything extracted in this notebook
output_folder = Path('../03-output')
output_folder.mkdir(exist_ok=True)

# Unpack the entire outer archive (including the nested zip files) into it
archive = zipfile.ZipFile('../data/netcdf_data.zip')
with archive:
    archive.extractall(path=output_folder)

Make a list of the zipfiles

In [7]:
# The nested zip files sit one folder level below 'netcdf_data/zipped'
zip_pattern = 'netcdf_data/zipped/*/*.zip'
zipfiles = [zip_path for zip_path in output_folder.glob(zip_pattern)]
zipfiles[:5]

[PosixPath('../03-output/netcdf_data/zipped/zipped_1991/12270_1991.zip'),
 PosixPath('../03-output/netcdf_data/zipped/zipped_1996/12270_1996.zip'),
 PosixPath('../03-output/netcdf_data/zipped/zipped_1998/12270_1998.zip'),
 PosixPath('../03-output/netcdf_data/zipped/zipped_1999/12270_1999.zip'),
 PosixPath('../03-output/netcdf_data/zipped/zipped_1997/12270_1997.zip')]

### Part 1: extract with a single file

In [8]:
# take the first zip file in the list as the single test case for Part 1
f = zipfiles[0]
f

PosixPath('../03-output/netcdf_data/zipped/zipped_1991/12270_1991.zip')

#### 1a) Use ``ZipFile.namelist()`` (as above) to list the contents

This will yield the name of the ``*.nc`` file that we need to extract

In [9]:
with zipfile.ZipFile(f) as src:
    # namelist() returns every entry in the archive, which can include
    # directory entries; keep only the NetCDF members instead of assuming
    # that the first entry is the file we want
    nc_members = [name for name in src.namelist() if name.endswith('.nc')]

# raises IndexError if the archive contains no *.nc member
nc_file = nc_members[0]
print(nc_file)

12270_1991/prcp.nc


#### 1b) Use ``ZipFile.extract()`` to extract the ``*.nc`` file to the destination folder
(you may need to create the destination folder first)

In [10]:
# Pull just the NetCDF member out of the archive; the subfolder it lives in
# inside the archive is recreated underneath output_folder
with zipfile.ZipFile(f) as archive:
    archive.extract(member=nc_file, path=output_folder)

#### 1c) Move the extracted file out of any enclosing subfolders, and rename to ``prcp_<year>.nc``
(so that if we repeat this for subsequent files, the extracted ``*.nc`` files will end up in the same place)

In [11]:
# location of the file that was just extracted
extracted_path = output_folder.joinpath(nc_file)
extracted_path

PosixPath('../03-output/12270_1991/prcp.nc')

In [12]:
# build the destination name: <variable>_<year>.nc
nc_file = Path(nc_file)

# the year is the part after the underscore in the enclosing folder's name
enclosing_folder = nc_file.parent.name
year = enclosing_folder.split('_')[1]

# the variable name is the file name without its extension
variable = nc_file.stem

new_file = output_folder.joinpath(f"{variable}_{year}.nc")
new_file

PosixPath('../03-output/prcp_1991.nc')

In [13]:
# relocate the extracted file to its final, year-stamped name
shutil.move(src=extracted_path, dst=new_file)

PosixPath('../03-output/prcp_1991.nc')

#### 1d) Remove the extra subfolders that were extracted

In [14]:
# the subfolder that was extracted along with the file is no longer needed
# (rmdir only succeeds on an empty folder)
leftover_folder = extracted_path.parent
leftover_folder.rmdir()

### Part 2: put the above steps together into a loop to repeat the workflow for all of the NetCDF files

In [15]:
for f in zipfiles:
    with zipfile.ZipFile(f) as src:

        # get the NetCDF file; filter on the extension rather than taking
        # the first entry, which could be a directory entry
        # (raises IndexError if the archive has no *.nc member)
        nc_file = [name for name in src.namelist() if name.endswith('.nc')][0]

        # extract it to the output folder
        # (this also recreates the subfolder it sits in inside the archive)
        src.extract(nc_file, output_folder)

    # the archive is closed at this point; the remaining steps only
    # touch the extracted file on disk

    # make a path for the extracted file
    extracted_path = output_folder / nc_file

    # make a path for the new file: <variable>_<year>.nc, with the year
    # taken from the name of the enclosing folder (e.g. 12270_1991)
    nc_file = Path(nc_file)
    variable = nc_file.stem
    year = nc_file.parent.name.split('_')[1]
    new_file = output_folder / f"{variable}_{year}.nc"

    # move the extracted NetCDF file to the dest. location
    shutil.move(extracted_path, new_file)

    # remove the (now empty) subfolder that was extracted
    extracted_path.parent.rmdir()

    print(f"{f}/{nc_file} --> {new_file}")

../03-output/netcdf_data/zipped/zipped_1991/12270_1991.zip/12270_1991/prcp.nc --> ../03-output/prcp_1991.nc
../03-output/netcdf_data/zipped/zipped_1996/12270_1996.zip/12270_1996/prcp.nc --> ../03-output/prcp_1996.nc
../03-output/netcdf_data/zipped/zipped_1998/12270_1998.zip/12270_1998/prcp.nc --> ../03-output/prcp_1998.nc
../03-output/netcdf_data/zipped/zipped_1999/12270_1999.zip/12270_1999/prcp.nc --> ../03-output/prcp_1999.nc
../03-output/netcdf_data/zipped/zipped_1997/12270_1997.zip/12270_1997/prcp.nc --> ../03-output/prcp_1997.nc
../03-output/netcdf_data/zipped/zipped_1990/12270_1990.zip/12270_1990/prcp.nc --> ../03-output/prcp_1990.nc
../03-output/netcdf_data/zipped/zipped_2003/12270_2003.zip/12270_2003/prcp.nc --> ../03-output/prcp_2003.nc
../03-output/netcdf_data/zipped/zipped_2004/12270_2004.zip/12270_2004/prcp.nc --> ../03-output/prcp_2004.nc
../03-output/netcdf_data/zipped/zipped_2005/12270_2005.zip/12270_2005/prcp.nc --> ../03-output/prcp_2005.nc
../03-output/netcdf_data/zip

### Another way to do this using ``os`` instead of ``pathlib``
(from the 2018 Madison Python class)

In [16]:
# declare a destination path
dest_path = 'extracted_data'
variable = 'prcp'

for f in zipfiles:
    with zipfile.ZipFile(f) as src:
        # get the path to the source file and the year
        # e.g. '.../12270_1991.zip' --> name '12270_1991', year '1991'
        # (index the result instead of unpacking into `_`, which IPython
        # uses for the last cell output)
        fname = os.path.split(f)[1]
        # the '.tar' replacement is a no-op for the *.zip names used here
        # (presumably left over from handling *.tar.* archives)
        name = os.path.splitext(fname)[0].replace('.tar', '')
        srcfile = '{}/{}.nc'.format(name, variable)
        year = name.split('_')[1]

        # where we want the extracted .nc file to end up
        destfile = os.path.join(dest_path, '{}_{}.nc'.format(variable, year))

        # extract the srcfile path to the dest_path folder
        # unfortunately this extracts the whole path, not just the file
        src.extract(srcfile, dest_path)
        # move the file up out of its subfolder and rename it to include the
        # year in a single step; unlike os.rename, os.replace overwrites an
        # existing destfile on every platform, so this cell can be re-run
        os.replace(os.path.join(dest_path, srcfile), destfile)
        # trash subfolders that were extracted
        os.rmdir(os.path.join(dest_path, name))
        print('{}/{} --> {}'.format(f, srcfile, destfile))

../03-output/netcdf_data/zipped/zipped_1991/12270_1991.zip/12270_1991/prcp.nc --> extracted_data/prcp_1991.nc
../03-output/netcdf_data/zipped/zipped_1996/12270_1996.zip/12270_1996/prcp.nc --> extracted_data/prcp_1996.nc
../03-output/netcdf_data/zipped/zipped_1998/12270_1998.zip/12270_1998/prcp.nc --> extracted_data/prcp_1998.nc
../03-output/netcdf_data/zipped/zipped_1999/12270_1999.zip/12270_1999/prcp.nc --> extracted_data/prcp_1999.nc
../03-output/netcdf_data/zipped/zipped_1997/12270_1997.zip/12270_1997/prcp.nc --> extracted_data/prcp_1997.nc
../03-output/netcdf_data/zipped/zipped_1990/12270_1990.zip/12270_1990/prcp.nc --> extracted_data/prcp_1990.nc
../03-output/netcdf_data/zipped/zipped_2003/12270_2003.zip/12270_2003/prcp.nc --> extracted_data/prcp_2003.nc
../03-output/netcdf_data/zipped/zipped_2004/12270_2004.zip/12270_2004/prcp.nc --> extracted_data/prcp_2004.nc
../03-output/netcdf_data/zipped/zipped_2005/12270_2005.zip/12270_2005/prcp.nc --> extracted_data/prcp_2005.nc
../03-outp