SIGN IN SIGN UP

Python Data Science Handbook: full text in Jupyter Notebooks

0 0 0 Jupyter Notebook
2017-08-14 13:04:58 -07:00
"""
This script copies all notebooks from the book into the website directory, and
creates pages which wrap them and link together.
"""
import os
import nbformat
2017-08-14 13:26:42 -07:00
import shutil
2017-08-14 13:04:58 -07:00
# Template for the page stub written alongside each copied notebook.
# It is a metadata header (title, url, save_as, Template), then a blank
# line, then a ``notebook`` tag that embeds the selected cells of the
# notebook.  The doubled braces come out of ``str.format`` as literal
# ``{`` / ``}``; the single-brace names are the fields filled in by
# ``copy_notebooks``.
PAGEFILE = """title: {title}
url:
save_as: {htmlfile}
Template: {template}

{{% notebook notebooks/{notebook_file} cells[{cells}] %}}
"""
2017-08-15 07:53:23 -07:00
# Markdown text that copy_notebooks() writes into the third cell
# (cells[2], the license cell) of Index.ipynb before saving the copy.
INTRO_TEXT = """This website contains the full text of the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook) in the form of Jupyter notebooks.
The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT).
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!
"""
2017-08-14 13:04:58 -07:00
def abspath_from_here(*args):
    """Return an absolute path built relative to this file's directory.

    ``args`` are path components joined, in order, onto the directory
    that contains this source file; the result is normalised with
    ``os.path.abspath``.
    """
    base_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(base_dir, *args))
# Directory the notebooks are read from: ``../notebooks`` relative to
# this file.
NB_SOURCE_DIR = abspath_from_here('..', 'notebooks')
# Directory the rewritten notebooks are written to.
NB_DEST_DIR = abspath_from_here('content', 'notebooks')
# Directory the generated ``.md`` page stubs are written to.
PAGE_DEST_DIR = abspath_from_here('content', 'pages')
def copy_notebooks():
    """Copy notebooks and figures into the site tree and write page stubs.

    For every ``*.ipynb`` file in ``NB_SOURCE_DIR`` (in sorted order):

    * links to other notebooks in markdown cells are rewritten to the
      lower-cased ``.html`` page names, and ``figures/<name>`` links are
      rewritten to ``/PythonDataScienceHandbook/figures/<name>``;
    * the modified notebook is written to ``NB_DEST_DIR``;
    * a ``.md`` page stub rendered from ``PAGEFILE`` is written to
      ``PAGE_DEST_DIR``.

    The ``content/figures`` directory is removed (if present) and
    re-copied from the source notebooks' ``figures`` directory on each
    run.

    Raises:
        ValueError: if, for a notebook other than ``Index.ipynb``, the
            third cell is not a single line starting with ``#``.
    """
    # exist_ok avoids the check-then-create race of a separate exists().
    os.makedirs(NB_DEST_DIR, exist_ok=True)
    os.makedirs(PAGE_DEST_DIR, exist_ok=True)

    nblist = sorted(nb for nb in os.listdir(NB_SOURCE_DIR)
                    if nb.endswith('.ipynb'))
    # Notebook file name -> lower-cased HTML page name.
    name_map = {nb: nb.rsplit('.', 1)[0].lower() + '.html'
                for nb in nblist}

    # Replace any previously copied figures with a fresh copy.
    figsource = abspath_from_here('..', 'notebooks', 'figures')
    figdest = abspath_from_here('content', 'figures')
    if os.path.exists(figdest):
        shutil.rmtree(figdest)
    shutil.copytree(figsource, figdest)

    # These are link targets inside markdown, not filesystem paths, so
    # they are built with '/' explicitly: os.path.join would produce
    # backslashes on Windows and the keys would never match.
    figure_map = {'figures/' + fig:
                  '/PythonDataScienceHandbook/figures/' + fig
                  for fig in os.listdir(figdest)}

    for nb in nblist:
        base = os.path.splitext(nb)[0]
        print('-', nb)

        content = nbformat.read(os.path.join(NB_SOURCE_DIR, nb),
                                as_version=4)

        if nb == 'Index.ipynb':
            # cells[0] is the title
            # cells[1] is the cover image
            # cells[2] is the license
            cells = '1:'
            template = 'page'
            title = 'Python Data Science Handbook'
            content.cells[2].source = INTRO_TEXT
        else:
            # cells[0] is the book information
            # cells[1] is the navigation bar
            # cells[2] is the title
            cells = '2:'
            template = 'booksection'
            title = content.cells[2].source
            if not title.startswith('#') or len(title.splitlines()) > 1:
                raise ValueError('title not found in third cell')
            title = title.lstrip('#').strip()

            # put nav below title: the title cell moves to the front, so
            # the 'cells[2:]' slice used by the page starts at the
            # navigation bar.
            content.cells.insert(0, content.cells.pop(2))

        # Replace internal URLs and figure links in notebook
        for cell in content.cells:
            if cell.cell_type != 'markdown':
                continue

            source = cell.source
            # str.replace is a no-op when the substring is absent, so no
            # membership pre-check is needed.
            for nbname, htmlname in name_map.items():
                source = source.replace(nbname, htmlname)
            for figname, newfigname in figure_map.items():
                source = source.replace(figname, newfigname)

            if source.startswith("<!--NAVIGATION-->"):
                # Undo replacement of notebook link in the colab badge:
                # the last occurrence of this notebook's own page name
                # is turned back into the notebook file name.
                source = nb.join(source.rsplit(name_map[nb], 1))

            cell.source = source

        nbformat.write(content, os.path.join(NB_DEST_DIR, nb))

        pagefile = os.path.join(PAGE_DEST_DIR, base + '.md')
        # The title is taken from notebook text, so write UTF-8
        # explicitly instead of relying on the platform default.
        with open(pagefile, 'w', encoding='utf-8') as f:
            # save_as uses the same name_map entry that links to this
            # notebook were rewritten to, so the two cannot drift apart.
            f.write(PAGEFILE.format(title=title,
                                    htmlfile=name_map[nb],
                                    notebook_file=nb,
                                    template=template,
                                    cells=cells))
2017-08-14 13:04:58 -07:00
# Script entry point: perform the copy only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    copy_notebooks()