media

#default_exp media

Convert HTML and Images to MDX

Make allowances for plots and dataframes in MDX

#export
from nbconvert.preprocessors import Preprocessor
from fastcore.xtras import Path
from html.parser import HTMLParser

#hide
from nbdoc.test_utils import run_preprocessor

#export
class HTMLdf(HTMLParser):
    """HTML parser that reports whether markup contains a dataframe's scoped stylesheet."""
    # Class-level defaults; an instance shadows them once the parser sets a flag.
    df = False      # set to True when `.dataframe` text is seen while `scoped` is set
    scoped = False  # set by a `<style scoped>` start tag, cleared by a `</style>` end tag

    def handle_starttag(self, tag, attrs):
        "Raise the `scoped` flag when a `<style>` tag carries a `scoped` attribute."
        if tag != 'style':
            return
        if any(name == 'scoped' for name, _ in attrs):
            self.scoped = True

    def handle_data(self, data):
        "Mark a dataframe as found when `.dataframe` occurs in text seen while `scoped` is set."
        if self.scoped and '.dataframe' in data:
            self.df = True

    def handle_endtag(self, tag):
        "Lower the `scoped` flag when a `<style>` element closes."
        if tag == 'style':
            self.scoped = False

    @classmethod
    def search(cls, x):
        "Feed the HTML string `x` to a fresh parser and return its `df` flag."
        finder = cls()
        finder.feed(x)
        return finder.df

# Fixture: a `<style scoped>` block whose rules target `.dataframe`.
_test_html = """<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>"""

# A scoped `.dataframe` stylesheet is detected; markup without one is not.
assert HTMLdf.search(_test_html)
assert not HTMLdf.search('<div></div>')

#export
class HTMLEscape(Preprocessor):
    """
    Place HTML in a codeblock and surround it with a <HTMLOutputBlock> component.

    For each output of a code cell that carries `text/html` data, the HTML is
    stripped and wrapped in an ```html fence. The cell metadata is flagged with
    `html_output = True`, and `html_center` is set to `False` when `HTMLdf`
    detects a dataframe in the HTML and to `True` otherwise.
    """
    def preprocess_cell(self, cell, resources, index):
        "Fence the `text/html` outputs of a code `cell`; returns `(cell, resources)`."
        if cell.cell_type != 'code':
            return cell, resources
        for o in cell.outputs:
            html = (o.get('data') or {}).get('text/html')
            if not html:
                continue
            cell.metadata.html_output = True
            # Dataframes are the one kind of HTML output that is not centered.
            cell.metadata.html_center = not HTMLdf.search(html)
            o['data']['text/html'] = '```html\n' + html.strip() + '\n```'
        return cell, resources

By default, HTML is incompatible with MDX. We place HTML in a code block and wrap it with a custom component so that the static site generator can render it.

# Run `HTMLEscape` over the pandas test notebook and display the converted result.
c, _ = run_preprocessor([HTMLEscape], 'test_files/pandas.ipynb', display_results=True)
# The output must be wrapped in the component, and the string 'center' must not appear.
assert '<HTMLOutputBlock' in c and '</HTMLOutputBlock>' in c and 'center' not in c
# The HTML itself must sit inside an ```html fence.
assert '```html\n<div>' in c and '</div>\n```' in c

```python
import pandas as pd
pd.read_csv('https://github.com/outerbounds/.data/raw/main/hospital_readmission.csv').head().iloc[:, :15]
```
    
<HTMLOutputBlock >




```html
<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>time_in_hospital</th>
      <th>num_lab_procedures</th>
      <th>num_procedures</th>
      <th>num_medications</th>
      <th>number_outpatient</th>
      <th>number_emergency</th>
      <th>number_inpatient</th>
      <th>number_diagnoses</th>
      <th>race_Caucasian</th>
      <th>race_AfricanAmerican</th>
      <th>gender_Female</th>
      <th>age_[70-80)</th>
      <th>age_[60-70)</th>
      <th>age_[50-60)</th>
      <th>age_[80-90)</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>14</td>
      <td>41</td>
      <td>0</td>
      <td>11</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>6</td>
      <td>True</td>
      <td>False</td>
      <td>False</td>
      <td>False</td>
      <td>False</td>
      <td>True</td>
      <td>False</td>
    </tr>
    <tr>
      <th>1</th>
      <td>2</td>
      <td>30</td>
      <td>0</td>
      <td>12</td>
      <td>0</td>
      <td>0</td>
      <td>1</td>
      <td>9</td>
      <td>True</td>
      <td>False</td>
      <td>True</td>
      <td>False</td>
      <td>False</td>
      <td>True</td>
      <td>False</td>
    </tr>
    <tr>
      <th>2</th>
      <td>5</td>
      <td>66</td>
      <td>0</td>
      <td>22</td>
      <td>1</td>
      <td>0</td>
      <td>2</td>
      <td>9</td>
      <td>True</td>
      <td>False</td>
      <td>True</td>
      <td>False</td>
      <td>False</td>
      <td>False</td>
      <td>True</td>
    </tr>
    <tr>
      <th>3</th>
      <td>3</td>
      <td>63</td>
      <td>0</td>
      <td>8</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>8</td>
      <td>True</td>
      <td>False</td>
      <td>True</td>
      <td>False</td>
      <td>False</td>
      <td>True</td>
      <td>False</td>
    </tr>
    <tr>
      <th>4</th>
      <td>5</td>
      <td>40</td>
      <td>0</td>
      <td>6</td>
      <td>0</td>
      <td>0</td>
      <td>1</td>
      <td>9</td>
      <td>True</td>
      <td>False</td>
      <td>True</td>
      <td>False</td>
      <td>False</td>
      <td>False</td>
      <td>True</td>
    </tr>
  </tbody>
</table>
</div>
```



</HTMLOutputBlock>

#hide
# For the Altair test notebook the converted result is expected to contain 'center'.
c, _ = run_preprocessor([HTMLEscape], 'test_files/altair.ipynb')
assert 'center' in c

#export
class ImageSave(Preprocessor):
    "Saves images stored as bytes in notebooks to disk."
    def preprocess(self, nb, resources):
        """
        Write every entry of `resources['outputs']` to `<path>/_<name>_files/<key>`.

        `name` and `path` are read from `resources['metadata']`. When both a name
        and outputs are present, `resources['fmap']` is (re)created as a mapping
        from each output key to its `_<name>_files/<key>` relative path.
        Returns `(nb, resources)`.
        """
        meta = resources.get('metadata', {})
        nb_name = meta.get('name')
        nb_path = meta.get('path')
        outfiles = resources.get('outputs')
        if nb_name and outfiles:
            fmap = resources['fmap'] = {}
            # A missing `path` falls back to the current directory instead of
            # failing on `Path(None)`.
            base = Path(nb_path or '.')
            for k, v in outfiles.items():
                rel = f'_{nb_name}_files/{k}'
                dest = base/rel
                # `parents=True` so nested output keys or a not-yet-existing
                # notebook directory do not raise FileNotFoundError.
                dest.parent.mkdir(parents=True, exist_ok=True)
                dest.write_bytes(v)
                fmap[f'{k}'] = rel
        return nb, resources

class ImagePath(Preprocessor):
    "Changes the image path to the location where `ImageSave` saved the files."
    def preprocess_cell(self, cell, resources, index):
        "Remap each output's `metadata.filenames` values through `resources['fmap']`."
        fmap = resources.get('fmap')
        if not fmap:
            return cell, resources
        for out in cell.get('outputs', []):
            names = out.get('metadata', {}).get('filenames', {})
            # Values without an entry in `fmap` are kept unchanged.
            names.update({key: fmap.get(old, old) for key, old in names.items()})
        return cell, resources

`ImageSave` and `ImagePath` must be used together to extract and save images from notebooks and to update the image paths accordingly. This is necessary to enable compatibility with certain plotting libraries such as matplotlib.

# Run both preprocessors over the matplotlib test notebook and display the result.
c, _ = run_preprocessor([ImageSave, ImagePath], 'test_files/matplotlib.ipynb', display_results=True)
# The image link must point into the `_matplotlib_files/` directory.
assert '![png](_matplotlib_files/output_0_1.png)' in c

```python
from matplotlib import pyplot as plt
plt.plot(range(20), range(20))
plt.plot(range(10), range(10))
```

<CodeOutputBlock lang="python">

    [<matplotlib.lines.Line2D at 0x7f9d6922faf0>]

![png](_matplotlib_files/output_0_1.png)
    
</CodeOutputBlock>

# Same check for the altair_jpeg test notebook: the svg link must point into `_altair_jpeg_files/`.
c, _ = run_preprocessor([ImageSave, ImagePath], 'test_files/altair_jpeg.ipynb')
assert '![svg](_altair_jpeg_files/output_0_0.svg' in c