diff --git a/06_HTML_XML_JSON.ipynb b/06_HTML_XML_JSON.ipynb
new file mode 100644
index 0000000..dd5c5e4
--- /dev/null
+++ b/06_HTML_XML_JSON.ipynb
@@ -0,0 +1,1347 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import requests\n",
+ "import json\n",
+ "\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "from bs4 import BeautifulSoup\n",
+ "\n",
+ "import boto3\n",
+ "\n",
+ "from IPython.display import HTML, display"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#quick warmup\n",
+ "\n",
+ "# imagine a key-value data container\n",
+ "inventory = {\n",
+ " \"001\": [{\"name\": \"Milk\", \"quantity\": 34, \"price\": 1.99}],\n",
+ " \"002\": [{\"name\": \"Bread\", \"quantity\": 20, \"price\": 2.49},\n",
+ " {\"name\": \"Nutella\", \"quantity\": 5, \"price\": 2.49}] \n",
+ " }\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#How would a store manager find out how much is milk\n",
+ "\n",
+ "#How do we add new item? What do we need to do?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "name_to_item = {}\n",
+ "for item_id, details_list in inventory.items():\n",
+ " for details in details_list:\n",
+ " # Here we are assuming product names are unique\n",
+ " name_to_item[details[\"name\"]] = {\"id\": item_id, \"quantity\": details[\"quantity\"], \"price\": details[\"price\"]}\n",
+ "#if details[\"name\"] is not unique, what happens?\n",
+ "print(name_to_item)"
+ ]
+ },
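+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A small sketch answering the warm-up questions above, using the name index from the previous cell.\n",
+ "# (The new ID '003' and the Butter record are made-up example values.)\n",
+ "print(name_to_item['Milk']['price'])\n",
+ "\n",
+ "# adding a new item = picking an unused ID and appending a record under it...\n",
+ "inventory['003'] = [{'name': 'Butter', 'quantity': 12, 'price': 3.49}]\n",
+ "# ...and keeping the derived name_to_item index in sync\n",
+ "name_to_item['Butter'] = {'id': '003', 'quantity': 12, 'price': 3.49}\n",
+ "inventory"
+ ]
+ },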
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Lecture 05 - JSON, XML, HTML, Requests and APIs\n",
+ "by Jan Šíla, Vítek Macháček
\n",
+ "March 26, 2024\n",
+ "\n",
+ "### Contents\n",
+ "\n",
+ "* Standardized data representation\n",
+ "* JSON\n",
+ "* XML\n",
+ "* Introduction to BeautifulSoup\n",
+ "* Basics of HTML (+ Element Inspection)\n",
+ "* Introduction to Requests (GET vs. POST) and APIs\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Goals:\n",
+ " \n",
+ "* work with data online/real-time data\n",
+ "* acquisition, processing - > results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Date exchange formats - JSON, XML\n",
+ "\n",
+ "`Language of the internet`\n",
+ "\n",
+ "* You can send/receive a message with (almost) any service\n",
+ "\n",
+ "* send .docx -> what if I do not have MS Word?\n",
+ "* we need a simple data format which would work on any machine (system agnostic), is general (can write anything) and is ediatable in basic editors\n",
+ "\n",
+ "* More complex than simple tables\n",
+ "* Highly structured - if you dont follow the rules, you are out\n",
+ "* Both sides need to understand the structure (comments in yaml)\n",
+ "* only data, no code to be run (security measure)\n",
+ "* distributed as text/string (to be precise as `bytes` literals) \n",
+ "* parsed to objects - easy to work with straight away\n",
+ "* Can be persisted as special files, or some data streams from APIs. \n",
+ "* Human readable\n",
+ "* Hierarchical\n",
+ "* Can be fetched using standard web APIs\n",
+ "\n",
+ "### Purpose\n",
+ "\n",
+ "1. Communication \n",
+ " * All imaginable communication channels\n",
+ " * Applications within single server/machine\n",
+ " * Only transferring of data\n",
+ " * Both sides need to understand the structure\n",
+ "\n",
+ "2. Storing\n",
+ " * self-descriptive\n",
+ " * human readable\n",
+ " * also in DBs - SQL, MongoDB etc.\n",
+ "\n",
+ "3. Standardization\n",
+ " * predictability\n",
+ " * cooperation\n",
+ " * spillovers from standardization\n",
+ "\n",
+ "\n",
+ "### Dimensionality problem\n",
+ "\n",
+ "* rich information comes at costs of data complexity \n",
+ "* to interrelate information, you need to high dimensionality (or A LOT of columns) or declaratory formats such as protobuf\n",
+ "* Strongly object-oriented\n",
+ "\n",
+ "\n",
+ "### 1D:\n",
+ "* logs\n",
+ "\n",
+ "### 2D: CSVs\n",
+ "* tabular data (like pandas DFs)\n",
+ "\n",
+ "### 3+D:\n",
+ "#### XML\n",
+ "* eXtensible Markup Language is a software- and hardware-independent tool for storing and transporting data.\n",
+ "* Officialy defined at 1998, but its roots are even older.\n",
+ "* XML was designed to carry data - with focus on what data is\n",
+ "* HTML was designed to display data - with focus on what data should look like displayed \n",
+ "* XML tags are not predefined like HTML tags are\n",
+ "* more verbose than JSON\n",
+ "* can have comments !actually a really cool in useful feature!\n",
+ "* used historically as a transaction format in many areas: \n",
+ " * Scientific measurements\n",
+ " * News information\n",
+ " * Wheather measurements\n",
+ " * Financial transactions\n",
+ "* Necessary to use XML parser to use in Python or in JavaScript\n",
+ "\n",
+ "\n",
+ "### JSON\n",
+ "* JavaScript Object Notation\n",
+ "* often *.json* files\n",
+ "* but also used in the web etc.\n",
+ "* supports standard datatypes - strings, integers, floats, lists\n",
+ "* No comments\n",
+ "* More compact, less verbose\n",
+ "* No closing tags\n",
+ "* Used EVERYWHERE, BUT [NOT LICENSED FOR EVIL](https://www.json.org/license.html). If you want to do evil stuff, use XML instead.\n",
+ "* Native in JavaScript and close to native in Python (dictionary)\n",
+ "* Jupyter Notebooks\n",
+ "\n",
+ "\n",
+ "* common pitfals: properly formatted JSON is different to python dict. -> check: https://jsonlint.com/\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# JSON"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# general representation of a dictionary\n",
+ "# emphasis on accessibility -> key-value ( hash table )\n",
+ "# contains records, lists, or other dictionaries\n",
+ "\n",
+ "teachers = [\n",
+ " {'name':'Jozef Baruník','titles':['doc.','PhDr.','Ph.D.','Bc.','Mgr.'],'ID':1234,'courses':['JEM005','JEM116','JEM059','JEM061']},\n",
+ " {'name':'Martin Hronec','titles':['Bc.','Mgr.'],'ID':3421,'courses':['JEM005','JEM207']},\n",
+ "]\n",
+ "\n",
+ "courses = {\n",
+ " \"JEM005\":{'name':'Advanced Econometrics','ECTS':6,'teachers':[3421,1234]},\n",
+ " 'JEM207':{'name':'Data Processing in Python','ECTS':5,'teachers':[3421]},\n",
+ " 'JEM116':{'name':'Applied Econometrics','ECTS':6,'teachers':[1234]},\n",
+ " 'JEM059':{'name':'Quantitative Finance I.','ECTS':6,'teachers':[1234,5678]},\n",
+ " 'JEM061':{'name':'Quantitative Finance II.','ECTS':6,'teachers':[1234,5678]}\n",
+ "}\n",
+ "jsondata = {'teachers':teachers,'courses':courses}\n",
+ "jsondata"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "is this a valid JSON?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "https://jsonformatter.curiousconcept.com/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![python and JSON](./06_pics/python_json.png)"
+ ]
+ },
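+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tiny sketch of the pitfall mentioned above: a valid Python dict is not automatically valid JSON.\n",
+ "# json.dumps shows what changes: double quotes, true/false in lowercase, None becomes null.\n",
+ "print(json.dumps({'valid': True, 'missing': None, 'name': 'JSON'}))\n",
+ "\n",
+ "# and json.loads refuses single-quoted 'JSON'\n",
+ "try:\n",
+ "    json.loads(\"{'name': 'JSON'}\")\n",
+ "except json.JSONDecodeError as e:\n",
+ "    print('not valid JSON:', e)"
+ ]
+ },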
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "js = json.dumps(\n",
+ " jsondata\n",
+ ") #json formatted string!\n",
+ "\n",
+ "isinstance(js,str)\n"
+ ]
+ },
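+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of the opposite direction: json.loads parses a JSON-formatted string back into Python objects,\n",
+ "# and json.dump / json.load do the same against a file (the file name courses.json is just an example).\n",
+ "parsed = json.loads(js)\n",
+ "print(parsed['courses']['JEM207'])\n",
+ "\n",
+ "with open('courses.json', 'w', encoding='utf-8') as f:\n",
+ "    json.dump(jsondata, f, ensure_ascii=False, indent=2)\n",
+ "\n",
+ "with open('courses.json', encoding='utf-8') as f:\n",
+ "    loaded = json.load(f)\n",
+ "loaded == jsondata"
+ ]
+ },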
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "js"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jsondata['courses']['JEM005']['test']='test'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.DataFrame(jsondata['courses']).transpose()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfc = pd.read_json(json.dumps(jsondata['courses']),orient='index')\n",
+ "dfc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# lets come back to this a little later"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# eXtensible Markup Language (XML)\n",
+ "\n",
+ "* elements\n",
+ "* attributes\n",
+ "* tags\n",
+ "\n",
+ "### Tag\n",
+ "> <>\n",
+ "\n",
+ "### Element"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "source": [
+ "### Convert to python data-types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#either\n",
+ "'''content'''\n",
+ "\n",
+ "#or self-closing (no content)\n",
+ "'''''';\n",
+ "#
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Attributes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''''';"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![XML tree structure](./06_pics/xml_tree_structure.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```xml\n",
+ "\n",
+ " \n",
+ " Everyday Italian\n",
+ " AAaAA\n",
+ " Giada De Laurentis\n",
+ " 2005\n",
+ " 30.00\n",
+ " \n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```json\n",
+ "{\n",
+ " \"bookstore\":[\n",
+ " {\n",
+ " \"title\":\"Everyday Italian\",\n",
+ " \"lang\":\"ENG\",\n",
+ " \"author\":\"Giada de Laurentis\",\n",
+ " \"year\":2005,\n",
+ " \"price\":30\n",
+ " }\n",
+ " ]\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "\n",
+ "Takeaway: JSON and XML are not equivalents and cannot be freely mirrored. Unfortunately.\n",
+ "\n",
+ "JSON cannot have multiple tags with different properties ->title_en, title_cze perhaps"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Navigation\n",
+ "* Xpath\n",
+ "* CSS selectors \n",
+ "* **BeautifulSoup**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### BeatifulSoup in detail\n",
+ "each BS object represents\n",
+ "* an element\n",
+ "* the position in tree"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''string on more \n",
+ "nes '''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "xml = '''\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " 3421\n",
+ " 1234\n",
+ " \n",
+ " \n",
+ " 3421\n",
+ " \n",
+ " \n",
+ " 1234\n",
+ " \n",
+ " \n",
+ " 1234\n",
+ " 5678\n",
+ " \n",
+ " \n",
+ " 1234\n",
+ " 5678\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Martin Hronec\n",
+ " \n",
+ " \n",
+ " Jozef Baruník\n",
+ " \n",
+ " \n",
+ " Lukáš Vácha\n",
+ " \n",
+ " \n",
+ "\n",
+ "'''\n",
+ "\n",
+ "#unlike HTML, those tag names are defined by Vitek - no one else 'can' understand them -> flexibility is limited. But same issue with JSON to be fair\n",
+ "\n",
+ "soup = BeautifulSoup(xml)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dir(soup)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```find()``` will find a **first** element given the input\n",
+ "\n",
+ "```find_all()``` or ```findAll()``` finds a **all** elements given the input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "soup.find_all('course')[0].find('teacher-id')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jem059 = soup.find('course',{'id':'JEM059'}) #looking for a tag with attrbitues (optional)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jem059"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "soup.findAll('teacher-id')"
+ ]
+ },
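+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of the CSS-selector route mentioned in the Navigation list above:\n",
+ "# soup.select() takes a CSS selector string and returns a list of matching elements.\n",
+ "# The selector below (attribute selector + descendant) is just one possible way to express the same query as find().\n",
+ "soup.select('course[id=\"JEM059\"] teacher-id')"
+ ]
+ },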
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`soup['attr']` will return the value of attribute"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(jem059['ects'])\n",
+ "print(jem059['name'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "soup.findAll('teacher-id')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jem059"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "you can also navigate horizontally"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jem059.findNext('course').findNext('course')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jem059.findPrevious('course').findPrevious('course')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "and even upstream!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jem059.parent.parent"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#get all teacher ids\n",
+ "teacher_ids = [int(t.text) for t in soup.findAll('teacher-id')]\n",
+ "print(teacher_ids)\n",
+ "#get unique\n",
+ "set(teacher_ids)"
+ ]
+ },
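+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: join the two parts of the XML - map each teacher id to the teacher's name\n",
+ "# (assuming the <teacher id=...><name>...</name></teacher> structure shown above).\n",
+ "id_to_name = {t['id']: t.find('name').text for t in soup.findAll('teacher')}\n",
+ "id_to_name"
+ ]
+ },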
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "course = soup.find('course')\n",
+ "d = {\n",
+ " 'id':course['id'],\n",
+ " 'name':course['name'],\n",
+ " 'ects':course['ects'],\n",
+ " 'teachers':[int(t.text) for t in course.findAll('teacher-id')]\n",
+ "}\n",
+ "d"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Can convert to JSON-like"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "l = []\n",
+ "for course in soup.findAll('course'):\n",
+ " d = {'id':course['id'],\n",
+ " 'name':course['name'],\n",
+ " 'ects':course['ects'],\n",
+ " 'teachers':[int(t.text) for t in course.findAll('teacher-id')]}\n",
+ " l.append(d)\n",
+ "l"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Or in list-comprehension syntax"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "l = [{\n",
+ " 'id':course['id'],\n",
+ " 'name':course['name'],\n",
+ " 'ects':course['ects'],\n",
+ " 'teachers':[int(t.text) for t in course.findAll('teacher-id')]\n",
+ "} for course in soup.findAll('course')]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "l"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.DataFrame(l)"
+ ]
+ },
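+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A small sketch of flattening the nested structure into a plain 2D table:\n",
+ "# explode() turns the list-valued 'teachers' column into one row per (course, teacher) pair.\n",
+ "pd.DataFrame(l).explode('teachers')"
+ ]
+ },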
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# HTML\n",
+ "standard web-page consists of:\n",
+ "\n",
+ "* Browser-executed code (`front-end`)\n",
+ " * HTML \"DOM\" structure - the website content\n",
+ " * List of elements that are on website\n",
+ " * Links to CSS classes, ids and\n",
+ " * CSS stylesheets - website graphics\n",
+ " * JavaScripts - website interactivity \n",
+ "\n",
+ "* Server-executed (`back-end`)\n",
+ " * Server, database, app logic etc.\n",
+ " * Not available for scraping!\n",
+ " * May be available as API\n",
+ "\n",
+ "\n",
+ "## Web-scraping\n",
+ "* client side only\n",
+ "* Navigating HTML DOM by taking advantage of CSS structure\n",
+ "\n",
+ "## DOM (Document Object Module):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "html = '''\n",
+ "\n",
+ "
\n",
+ " Sample page\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ " \n",
+ " name | \n",
+ " number | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " C | \n",
+ " 3 | \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "\n",
+ "'''\n",
+ "display(HTML(html))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "soup = BeautifulSoup(html,'html')\n",
+ "soup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rows = soup.findAll('tr',{'class','normalRow'})\n",
+ "rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "d = {}\n",
+ "\n",
+ "for row in rows:\n",
+ " key = row.findAll('td')[0].text\n",
+ " val = int(row.findAll('td')[1].text)\n",
+ " d[key] = val\n",
+ "pd.Series(d)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "d"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.Series({\n",
+ " row.findAll('td')[0].text:int(row.findAll('td')[1].text) \n",
+ " for row in BeautifulSoup(html).findAll('tr',{'class':'normalRow'})})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "soup = BeautifulSoup(html)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "row = soup.findAll('tr',{'class':'normalRow'})[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "row"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "row.findAll('td')[0].text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "int(row.findAll('td')[1].text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "{row.findAll('td')[0].text:int(row.findAll('td')[1].text) for row in soup.findAll('tr',{'class':'normalRow'})}"
+ ]
+ },
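+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of a shortcut worth knowing: for plain HTML tables, pandas can do the parsing for us.\n",
+ "# read_html returns a list of DataFrames, one per <table> found; wrapping the literal string in StringIO\n",
+ "# avoids a deprecation warning. (Requires lxml or html5lib to be installed.)\n",
+ "from io import StringIO\n",
+ "pd.read_html(StringIO(html))[0]"
+ ]
+ },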
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## HTML Inspection\n",
+ "http://ies.fsv.cuni.cz/cs/node/51"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# requests and internet communication\n",
+ "\n",
+ "* `Client` asks/requests questions (your Jupyter client)\n",
+ "* `Server` replies/serve answers (your Jupyter server)\n",
+ "\n",
+ "\n",
+ "API = *Application Programming Interface*\n",
+ "\n",
+ "very general term! Not only used in web communication\n",
+ "\n",
+ "## HTTP requests\n",
+ "\n",
+ "A most standard webserver communication channel around\n",
+ "\n",
+ "A standard HTTP request contains:\n",
+ "\n",
+ "* URL \n",
+ "\n",
+ " * domain\n",
+ " * route\n",
+ " * parameters\n",
+ "\n",
+ "* Request Type - GET, POST, PUT, DELETE (see below)\n",
+ "\n",
+ "* Content specification - \n",
+ " * Application/JSON\n",
+ " * Application/XML\n",
+ " * text/html\n",
+ " * text/css\n",
+ "\n",
+ "* Content\n",
+ "\n",
+ "* Outcoming data (will see below)\n",
+ "\n",
+ "* Cookies \n",
+ "\n",
+ "* Status Code:\n",
+ "\n",
+ " * 200 - success\n",
+ " * 404 - resource does not exist\n",
+ " * 500 - the server failed during processing your request\n",
+ "\n",
+ "\n",
+ "1) REST API - use HTTP request and returns JSON\n",
+ "\n",
+ "2) SOAP API - use HTTP request and returns XML\n",
+ "\n",
+ "3) Website - use HTTP request and returns set of HTML, JavaScript, CSS and other files\n",
+ "\n",
+ "### When to use?\n",
+ "* whenever more applications need to communicate\n",
+ "* user-friendly interface for complicated tasks - DEEP AI, Google Maps\n",
+ "* Data - Golemio, OpenStreetMaps\n",
+ "\n",
+ "### GET request\n",
+ "* fast\n",
+ "* public\n",
+ "* data flow only one direction\n",
+ "* parameters via request adress\n",
+ "\n",
+ "> https://www.google.com/search?q=how+to+understand+url+parameters&rlz=1C1GCEU_csCZ860CZ860&oq=how+to+understand+url+parameters&aqs=chrome..69i57j33i22i29i30l7.5237j0j4&sourceid=chrome&ie=UTF-8\n"
+ ]
+ },
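+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sketch of how URL parameters like the ones in the Google example above are usually built in code.\n",
+ "# httpbin.org is a public echo service (assumed reachable here); requests encodes the params dict into the ?key=value query string.\n",
+ "r = requests.get('https://httpbin.org/get', params={'q': 'how to understand url parameters'})\n",
+ "print(r.status_code)  # 200 on success\n",
+ "print(r.url)          # the full URL including the encoded parameters\n",
+ "r.json()['args']      # the server echoes the parameters back"
+ ]
+ },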
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r = requests.get('https://cs.wikipedia.org/wiki/Institut_ekonomick%C3%BDch_studi%C3%AD_Fakulty_soci%C3%A1ln%C3%ADch_v%C4%9Bd_Univerzity_Karlovy')\n",
+ "#plain request - like browser\n",
+ "r.text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "soup = BeautifulSoup(r.text,'html')\n",
+ "tags=soup.findAll('span', {'class':\"wd\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tags"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### POST request\n",
+ "* slow\n",
+ "* private\n",
+ "* both sides can send data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Static pages x Dynamic pages x JavaScript-rendered pages\n",
+ "\n",
+ "### Static\n",
+ "\n",
+ "* pages that do not get updated instantly\n",
+ "* all information necessary for rendering a website is available after entering the URL\n",
+ "* It may ask the database, but the output is stable.\n",
+ "* all parameters within the adress!\n",
+ "* Typical example:\n",
+ " \n",
+ "### JavaScript rendered: \n",
+ "* Defacto static, but you cannot take advantage of HTML/CSS structure\n",
+ "\n",
+ "### Dynamic content\n",
+ "* webpage instantly communicates with the webserver and the database\n",
+ "* \n",
+ "* solution -> Selenium!\n",
+ "\n",
+ "### Is this website static or dynamic?\n",
+ "\n",
+ "1. Facebook\n",
+ "2. Sreality.cz\n",
+ "3. IES website\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to chose data source for project\n",
+ "\n",
+ "You need to know in advance what data you will download:\n",
+ "\n",
+ "1. full or satisfactory access to API\n",
+ "2. the web-page is parsable (prefer not too much javascript)\n",
+ "3. plan to generate all requests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# APIs Example\n",
+ "### Get wiki data using GET"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#if time, return to geodata"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Lets start with a basic request"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "api_url = 'https://krcgc3uqga.execute-api.eu-central-1.amazonaws.com'\n",
+ "#this api implements three routers\n",
+ "# GET /time\n",
+ "# GET /stocks\n",
+ "# POST /hashme"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route = 'time'\n",
+ "# route /ruːt/\n",
+ "response = requests.get(f'{api_url}/{route}')\n",
+ "response.json()\n"
+ ]
+ },
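+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of defensive handling (the route name below is deliberately made up): a route that does not exist\n",
+ "# should come back with a non-2xx status, and raise_for_status() turns that into a Python exception\n",
+ "# instead of letting the code silently continue.\n",
+ "bad = requests.get(f'{api_url}/does-not-exist')\n",
+ "print(bad.status_code)\n",
+ "try:\n",
+ "    bad.raise_for_status()\n",
+ "except requests.HTTPError as e:\n",
+ "    print(e)"
+ ]
+ },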
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# route = stocks\n",
+ "\n",
+ "route = 'stocks'\n",
+ "# route /ruːt/\n",
+ "response = requests.get(f'{api_url}/{route}')\n",
+ "print(response.json())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route = \"hashme\"\n",
+ "url = f\"https://krcgc3uqga.execute-api.eu-central-1.amazonaws.com/{route}\"\n",
+ "\n",
+ "payload = json.dumps({\n",
+ " \"name\": \"Jan Sila\"\n",
+ "})\n",
+ "headers = {\n",
+ " 'Content-Type': 'application/json'\n",
+ "}\n",
+ "\n",
+ "response = requests.post(url, headers=headers, data=payload)\n",
+ "\n",
+ "print(response.json())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = requests.get('https://en.wikipedia.org/wiki/Charles_University')\n",
+ "soup = BeautifulSoup(response.text)\n",
+ "div = soup.find('div',{'id':'mw-content-text'}) # #mw-content-text > div > p:nth-child(10)texts)\n",
+ "article = ' '.join([p.text for p in div.find_all('p')])\n",
+ "print(article)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Bonus example:\n",
+ "\n",
+ "## GeoJSON\n",
+ "\n",
+ "* One standardized data format for transferring geodata\n",
+ "* Plenty of geodata out there\n",
+ "* see for example http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "verbose_request = requests.get('http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "print(verbose_request.status_code)\n",
+ "dir(verbose_request)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "verbose_request.json()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#get already json\n",
+ "d = requests.get('http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json').json()\n",
+ "\n",
+ "d['features'][0]['properties']"
+ ]
+ },
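+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Small sketch: a GeoJSON FeatureCollection is just JSON, so the non-geometric part\n",
+ "# (the 'properties' of each feature) can be tabulated like any other list of dicts.\n",
+ "pd.DataFrame([f['properties'] for f in d['features']]).head()"
+ ]
+ },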
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import branca\n",
+ "import folium\n",
+ "\n",
+ "colorscale = branca.colormap.linear.YlOrRd_09.scale(0, 5)\n",
+ "\n",
+ "def style_function(feature):\n",
+ " gridvalue = feature['properties']['GRIDVALUE']\n",
+ " return {\n",
+ " 'fillOpacity': 0.5,\n",
+ " 'weight': 0,\n",
+ " 'fillColor': colorscale(gridvalue)\n",
+ " }\n",
+ "\n",
+ "m = folium.Map(location=[50.085,14.45],zoom_start=11)\n",
+ "folium.GeoJson('http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json',style_function=style_function).add_to(m)\n",
+ "m"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/06_pics/python_json.png b/06_pics/python_json.png
new file mode 100644
index 0000000..49153dd
Binary files /dev/null and b/06_pics/python_json.png differ
diff --git a/06_pics/xml_tree_structure.png b/06_pics/xml_tree_structure.png
new file mode 100644
index 0000000..2ab6456
Binary files /dev/null and b/06_pics/xml_tree_structure.png differ