diff --git a/06_HTML_XML_JSON.ipynb b/06_HTML_XML_JSON.ipynb new file mode 100644 index 0000000..dd5c5e4 --- /dev/null +++ b/06_HTML_XML_JSON.ipynb @@ -0,0 +1,1347 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import requests\n", + "import json\n", + "\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from bs4 import BeautifulSoup\n", + "\n", + "import boto3\n", + "\n", + "from IPython.display import HTML, display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#quick warmup\n", + "\n", + "# imagine a key-value data container\n", + "inventory = {\n", + " \"001\": [{\"name\": \"Milk\", \"quantity\": 34, \"price\": 1.99}],\n", + " \"002\": [{\"name\": \"Bread\", \"quantity\": 20, \"price\": 2.49},\n", + " {\"name\": \"Nutella\", \"quantity\": 5, \"price\": 2.49}] \n", + " }\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#How would a store manager find out how much is milk\n", + "\n", + "#How do we add new item? What do we need to do?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "name_to_item = {}\n", + "for item_id, details_list in inventory.items():\n", + " for details in details_list:\n", + " # Here we are assuming product names are unique\n", + " name_to_item[details[\"name\"]] = {\"id\": item_id, \"quantity\": details[\"quantity\"], \"price\": details[\"price\"]}\n", + "#if details[\"name\"] is not unique, what happens?\n", + "print(name_to_item)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lecture 05 - JSON, XML, HTML, Requests and APIs\n", + "by Jan Šíla, Vítek Macháček
\n", + "March 26, 2024\n", + "\n", + "### Contents\n", + "\n", + "* Standardized data representation\n", + "* JSON\n", + "* XML\n", + "* Introduction to BeautifulSoup\n", + "* Basics of HTML (+ Element Inspection)\n", + "* Introduction to Requests (GET vs. POST) and APIs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Goals:\n", + " \n", + "* work with online/real-time data\n", + "* acquisition, processing -> results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data exchange formats - JSON, XML\n", + "\n", + "`Language of the internet`\n", + "\n", + "* You can send/receive a message with (almost) any service\n", + "\n", + "* send .docx -> what if I do not have MS Word?\n", + "* we need a simple data format which would work on any machine (system agnostic), is general (can write anything) and is editable in basic editors\n", + "\n", + "* More complex than simple tables\n", + "* Highly structured - if you don't follow the rules, you are out\n", + "* Both sides need to understand the structure (comments in yaml)\n", + "* only data, no code to be run (security measure)\n", + "* distributed as text/string (to be precise as `bytes` literals) \n", + "* parsed to objects - easy to work with straight away\n", + "* Can be persisted as special files, or some data streams from APIs. \n", + "* Human readable\n", + "* Hierarchical\n", + "* Can be fetched using standard web APIs\n", + "\n", + "### Purpose\n", + "\n", + "1. Communication \n", + " * All imaginable communication channels\n", + " * Applications within single server/machine\n", + " * Only transferring of data\n", + " * Both sides need to understand the structure\n", + "\n", + "2. Storing\n", + " * self-descriptive\n", + " * human readable\n", + " * also in DBs - SQL, MongoDB etc.\n", + "\n", + "3. 
Standardization\n", + " * predictability\n", + " * cooperation\n", + " * spillovers from standardization\n", + "\n", + "\n", + "### Dimensionality problem\n", + "\n", + "* rich information comes at the cost of data complexity \n", + "* to interrelate information, you need high dimensionality (or A LOT of columns) or declaratory formats such as protobuf\n", + "* Strongly object-oriented\n", + "\n", + "\n", + "### 1D:\n", + "* logs\n", + "\n", + "### 2D: CSVs\n", + "* tabular data (like pandas DFs)\n", + "\n", + "### 3+D:\n", + "#### XML\n", + "* eXtensible Markup Language is a software- and hardware-independent tool for storing and transporting data.\n", + "* Officially defined in 1998, but its roots are even older.\n", + "* XML was designed to carry data - with focus on what data is\n", + "* HTML was designed to display data - with focus on what data should look like displayed \n", + "* XML tags are not predefined like HTML tags are\n", + "* more verbose than JSON\n", + "* can have comments !actually a really cool and useful feature!\n", + "* used historically as a transaction format in many areas: \n", + " * Scientific measurements\n", + " * News information\n", + " * Weather measurements\n", + " * Financial transactions\n", + "* Necessary to use XML parser to use in Python or in JavaScript\n", + "\n", + "\n", + "### JSON\n", + "* JavaScript Object Notation\n", + "* often *.json* files\n", + "* but also used in the web etc.\n", + "* supports standard datatypes - strings, integers, floats, lists\n", + "* No comments\n", + "* More compact, less verbose\n", + "* No closing tags\n", + "* Used EVERYWHERE, BUT [NOT LICENSED FOR EVIL](https://www.json.org/license.html). If you want to do evil stuff, use XML instead.\n", + "* Native in JavaScript and close to native in Python (dictionary)\n", + "* Jupyter Notebooks\n", + "\n", + "\n", + "* common pitfalls: properly formatted JSON is different to python dict. 
-> check: https://jsonlint.com/\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# general representation of a dictionary\n", + "# emphasis on accessibility -> key-value ( hash table )\n", + "# contains records, lists, or other dictionaries\n", + "\n", + "teachers = [\n", + " {'name':'Jozef Baruník','titles':['doc.','PhDr.','Ph.D.','Bc.','Mgr.'],'ID':1234,'courses':['JEM005','JEM116','JEM059','JEM061']},\n", + " {'name':'Martin Hronec','titles':['Bc.','Mgr.'],'ID':3421,'courses':['JEM005','JEM207']},\n", + "]\n", + "\n", + "courses = {\n", + " \"JEM005\":{'name':'Advanced Econometrics','ECTS':6,'teachers':[3421,1234]},\n", + " 'JEM207':{'name':'Data Processing in Python','ECTS':5,'teachers':[3421]},\n", + " 'JEM116':{'name':'Applied Econometrics','ECTS':6,'teachers':[1234]},\n", + " 'JEM059':{'name':'Quantitative Finance I.','ECTS':6,'teachers':[1234,5678]},\n", + " 'JEM061':{'name':'Quantitative Finance II.','ECTS':6,'teachers':[1234,5678]}\n", + "}\n", + "jsondata = {'teachers':teachers,'courses':courses}\n", + "jsondata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "is this a valid JSON?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://jsonformatter.curiousconcept.com/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![python and JSON](./06_pics/python_json.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "js = json.dumps(\n", + " jsondata\n", + ") #json formatted string!\n", + "\n", + "isinstance(js,str)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "js" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jsondata['courses']['JEM005']['test']='test'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame(jsondata['courses']).transpose()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dfc = pd.read_json(json.dumps(jsondata['courses']),orient='index')\n", + "dfc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# lets come back to this a little later" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# eXtensible Markup Language (XML)\n", + "\n", + "* elements\n", + "* attributes\n", + "* tags\n", + "\n", + "### Tag\n", + "> <>\n", + "\n", + "### Element" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "### Convert to python data-types" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#either\n", + "'''content'''\n", + "\n", + "#or self-closing (no content)\n", + 
"'''''';\n", + "#
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Attributes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''''';" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![XML tree structure](./06_pics/xml_tree_structure.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```xml\n", + "\n", + " \n", + " Everyday Italian\n", + " AAaAA\n", + " Giada De Laurentis\n", + " 2005\n", + " 30.00\n", + " \n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```json\n", + "{\n", + " \"bookstore\":[\n", + " {\n", + " \"title\":\"Everyday Italian\",\n", + " \"lang\":\"ENG\",\n", + " \"author\":\"Giada de Laurentis\",\n", + " \"year\":2005,\n", + " \"price\":30\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "\n", + "Takeaway: JSON and XML are not equivalents and cannot be freely mirrored. Unfortunately.\n", + "\n", + "JSON cannot have multiple tags with different properties ->title_en, title_cze perhaps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Navigation\n", + "* XPath\n", + "* CSS selectors \n", + "* **BeautifulSoup**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BeautifulSoup in detail\n", + "each BS object represents\n", + "* an element\n", + "* the position in tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''string on more \n", + "nes '''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xml = '''\n", + "\n", + "\n", + " \n", + " \n", + " 3421\n", + " 1234\n", + " \n", + " \n", + " 3421\n", + " \n", + " \n", + " 1234\n", + " \n", + " \n", + " 1234\n", + " 5678\n", + " \n", + " \n", + " 1234\n", + " 5678\n", + " \n", + " \n", + " \n", + " \n", + " Martin Hronec\n", + " 
\n", + " \n", + " Jozef Baruník\n", + " \n", + " \n", + " Lukáš Vácha\n", + " \n", + " \n", + "\n", + "'''\n", + "\n", + "#unlike HTML, those tag names are defined by Vitek - no one else 'can' understand them -> flexibility is limited. But same issue with JSON to be fair\n", + "\n", + "soup = BeautifulSoup(xml)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir(soup)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```find()``` returns the **first** element matching the input\n", + "\n", + "```find_all()``` or ```findAll()``` returns **all** elements matching the input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soup.find_all('course')[0].find('teacher-id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jem059 = soup.find('course',{'id':'JEM059'}) #looking for a tag with attributes (optional)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jem059" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soup.findAll('teacher-id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`soup['attr']` will return the value of the attribute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(jem059['ects'])\n", + "print(jem059['name'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soup.findAll('teacher-id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jem059" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "you can also navigate horizontally" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jem059.findNext('course').findNext('course')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jem059.findPrevious('course').findPrevious('course')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and even upstream!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jem059.parent.parent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#get all teacher ids\n", + "teacher_ids = [int(t.text) for t in soup.findAll('teacher-id')]\n", + "print(teacher_ids)\n", + "#get unique\n", + "set(teacher_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "course = soup.find('course')\n", + "d = {\n", + " 'id':course['id'],\n", + " 'name':course['name'],\n", + " 'ects':course['ects'],\n", + " 'teachers':[int(t.text) for t in course.findAll('teacher-id')]\n", + "}\n", + "d" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Can convert to JSON-like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "l = []\n", + "for course in soup.findAll('course'):\n", + " d = {'id':course['id'],\n", + " 'name':course['name'],\n", + " 'ects':course['ects'],\n", + " 'teachers':[int(t.text) for t in course.findAll('teacher-id')]}\n", + " l.append(d)\n", + "l" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Or in list-comprehension syntax" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "l = [{\n", + " 'id':course['id'],\n", + " 'name':course['name'],\n", + " 'ects':course['ects'],\n", + " 'teachers':[int(t.text) for t in 
course.findAll('teacher-id')]\n", + "} for course in soup.findAll('course')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame(l)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HTML\n", + "A standard web page consists of:\n", + "\n", + "* Browser-executed code (`front-end`)\n", + " * HTML \"DOM\" structure - the website content\n", + " * List of elements that are on website\n", + " * Links to CSS classes and ids\n", + " * CSS stylesheets - website graphics\n", + " * JavaScripts - website interactivity \n", + "\n", + "* Server-executed (`back-end`)\n", + " * Server, database, app logic etc.\n", + " * Not available for scraping!\n", + " * May be available as API\n", + "\n", + "\n", + "## Web-scraping\n", + "* client side only\n", + "* Navigating HTML DOM by taking advantage of CSS structure\n", + "\n", + "## DOM (Document Object Model):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "html = '''\n", + "\n", + " \n", + " Sample page\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " My page header\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namenumber
B2
C3
\n", + "
\n", + "
\n", + " \n", + " \n", + "\n", + "'''\n", + "display(HTML(html))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soup = BeautifulSoup(html,'html')\n", + "soup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rows = soup.findAll('tr',{'class':'normalRow'}) # attrs is a dict mapping attribute name -> value\n", + "rows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d = {}\n", + "\n", + "for row in rows:\n", + " key = row.findAll('td')[0].text\n", + " val = int(row.findAll('td')[1].text)\n", + " d[key] = val\n", + "pd.Series(d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.Series({\n", + " row.findAll('td')[0].text:int(row.findAll('td')[1].text) \n", + " for row in BeautifulSoup(html).findAll('tr',{'class':'normalRow'})})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soup = BeautifulSoup(html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "row = soup.findAll('tr',{'class':'normalRow'})[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "row" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "row.findAll('td')[0].text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "int(row.findAll('td')[1].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "{row.findAll('td')[0].text:int(row.findAll('td')[1].text) for row in 
soup.findAll('tr',{'class':'normalRow'})}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HTML Inspection\n", + "http://ies.fsv.cuni.cz/cs/node/51" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# requests and internet communication\n", + "\n", + "* `Client` asks/requests questions (your Jupyter client)\n", + "* `Server` replies/serves answers (your Jupyter server)\n", + "\n", + "\n", + "API = *Application Programming Interface*\n", + "\n", + "very general term! Not only used in web communication\n", + "\n", + "## HTTP requests\n", + "\n", + "The most standard web-server communication channel around\n", + "\n", + "A standard HTTP request contains:\n", + "\n", + "* URL \n", + "\n", + " * domain\n", + " * route\n", + " * parameters\n", + "\n", + "* Request Type - GET, POST, PUT, DELETE (see below)\n", + "\n", + "* Content specification - \n", + " * Application/JSON\n", + " * Application/XML\n", + " * text/html\n", + " * text/css\n", + "\n", + "* Content\n", + "\n", + "* Outgoing data (will see below)\n", + "\n", + "* Cookies \n", + "\n", + "* Status Code:\n", + "\n", + " * 200 - success\n", + " * 404 - resource does not exist\n", + " * 500 - the server failed while processing your request\n", + "\n", + "\n", + "1) REST API - uses HTTP requests and returns JSON\n", + "\n", + "2) SOAP API - uses HTTP requests and returns XML\n", + "\n", + "3) Website - uses HTTP requests and returns a set of HTML, JavaScript, CSS and other files\n", + "\n", + "### When to use?\n", + "* whenever more applications need to communicate\n", + "* user-friendly interface for complicated tasks - DEEP AI, Google Maps\n", + "* Data - Golemio, OpenStreetMaps\n", + "\n", + "### GET request\n", + "* fast\n", + "* public\n", + "* data flows in only one direction\n", + "* parameters via request address\n", + "\n", + "> 
https://www.google.com/search?q=how+to+understand+url+parameters&rlz=1C1GCEU_csCZ860CZ860&oq=how+to+understand+url+parameters&aqs=chrome..69i57j33i22i29i30l7.5237j0j4&sourceid=chrome&ie=UTF-8\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r = requests.get('https://cs.wikipedia.org/wiki/Institut_ekonomick%C3%BDch_studi%C3%AD_Fakulty_soci%C3%A1ln%C3%ADch_v%C4%9Bd_Univerzity_Karlovy')\n", + "#plain request - like browser\n", + "r.text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soup = BeautifulSoup(r.text,'html')\n", + "tags=soup.findAll('span', {'class':\"wd\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tags" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### POST request\n", + "* slow\n", + "* private\n", + "* both sides can send data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Static pages x Dynamic pages x JavaScript-rendered pages\n", + "\n", + "### Static\n", + "\n", + "* pages that do not get updated instantly\n", + "* all information necessary for rendering a website is available after entering the URL\n", + "* It may ask the database, but the output is stable.\n", + "* all parameters within the address!\n", + "* Typical example:\n", + " \n", + "### JavaScript rendered: \n", + "* De facto static, but you cannot take advantage of HTML/CSS structure\n", + "\n", + "### Dynamic content\n", + "* webpage instantly communicates with the webserver and the database\n", + "* \n", + "* solution -> Selenium!\n", + "\n", + "### Is this website static or dynamic?\n", + "\n", + "1. Facebook\n", + "2. Sreality.cz\n", + "3. 
IES website\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to choose a data source for the project\n", + "\n", + "You need to know in advance what data you will download:\n", + "\n", + "1. full or satisfactory access to API\n", + "2. the web-page is parsable (prefer not too much javascript)\n", + "3. plan to generate all requests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# APIs Example\n", + "### Get wiki data using GET" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#if time, return to geodata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Let's start with a basic request" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "api_url = 'https://krcgc3uqga.execute-api.eu-central-1.amazonaws.com'\n", + "#this api implements three routes\n", + "# GET /time\n", + "# GET /stocks\n", + "# POST /hashme" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route = 'time'\n", + "# route /ruːt/\n", + "response = requests.get(f'{api_url}/{route}')\n", + "response.json()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# route = stocks\n", + "\n", + "route = 'stocks'\n", + "# route /ruːt/\n", + "response = requests.get(f'{api_url}/{route}')\n", + "print(response.json())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route = \"hashme\"\n", + "url = f\"https://krcgc3uqga.execute-api.eu-central-1.amazonaws.com/{route}\"\n", + "\n", + "payload = json.dumps({\n", + " \"name\": \"Jan Sila\"\n", + "})\n", + "headers = {\n", + " 'Content-Type': 
'application/json'\n", + "}\n", + "\n", + "response = requests.post(url, headers=headers, data=payload)\n", + "\n", + "print(response.json())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = requests.get('https://en.wikipedia.org/wiki/Charles_University')\n", + "soup = BeautifulSoup(response.text)\n", + "div = soup.find('div',{'id':'mw-content-text'}) # #mw-content-text > div > p:nth-child(10)texts)\n", + "article = ' '.join([p.text for p in div.find_all('p')])\n", + "print(article)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bonus example:\n", + "\n", + "## GeoJSON\n", + "\n", + "* One standardized data format for transferring geodata\n", + "* Plenty of geodata out there\n", + "* see for example http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "verbose_request = requests.get('http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "print(verbose_request.status_code)\n", + "dir(verbose_request)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "verbose_request.json()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "#get already json\n", + "d = 
requests.get('http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json').json()\n", + "\n", + "d['features'][0]['properties']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "import branca\n", + "import folium\n", + "\n", + "colorscale = branca.colormap.linear.YlOrRd_09.scale(0, 5)\n", + "\n", + "def style_function(feature):\n", + " gridvalue = feature['properties']['GRIDVALUE']\n", + " return {\n", + " 'fillOpacity': 0.5,\n", + " 'weight': 0,\n", + " 'fillColor': colorscale(gridvalue)\n", + " }\n", + "\n", + "m = folium.Map(location=[50.085,14.45],zoom_start=11)\n", + "folium.GeoJson('http://opendata.iprpraha.cz/CUR/OVZ/OVZ_Klima_ZnecOvzdusi_p/WGS_84/OVZ_Klima_ZnecOvzdusi_p.json',style_function=style_function).add_to(m)\n", + "m" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/06_pics/python_json.png b/06_pics/python_json.png new file mode 100644 index 0000000..49153dd Binary files /dev/null and 
b/06_pics/python_json.png differ diff --git a/06_pics/xml_tree_structure.png b/06_pics/xml_tree_structure.png new file mode 100644 index 0000000..2ab6456 Binary files /dev/null and b/06_pics/xml_tree_structure.png differ