From a308b7195c95282df040f062b06c2631bf688b60 Mon Sep 17 00:00:00 2001 From: PAlena <86962990+PAlena@users.noreply.github.com> Date: Tue, 2 Apr 2024 15:56:05 +0200 Subject: [PATCH] Add files via upload --- Seminar_3/seminar_empty_3.ipynb | 334 ++++++++++++++++++++++++++++++++ 1 file changed, 334 insertions(+) create mode 100644 Seminar_3/seminar_empty_3.ipynb diff --git a/Seminar_3/seminar_empty_3.ipynb b/Seminar_3/seminar_empty_3.ipynb new file mode 100644 index 0000000..7a016b0 --- /dev/null +++ b/Seminar_3/seminar_empty_3.ipynb @@ -0,0 +1,334 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "492eb907-b454-403f-914a-281a61751e07", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "id": "9b9babd9-e6ad-49bd-9e5e-c479363d305b", + "metadata": { + "tags": [] + }, + "source": [ + "# Seminar - APIs, DBs and Live coding" + ] + }, + { + "cell_type": "markdown", + "id": "4d7941ab-1155-4c89-8095-94edf11f889d", + "metadata": { + "tags": [] + }, + "source": [ + "## Task 1: Requesting API\n", + "### 1a. Create a function requesting data from sreality\n", + "\n", + "\n", + "```python\n", + "base_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb=1&category_type_cb=1&locality_region_id=10&per_page60&page={}'.format(i)\n", + "\n", + "r = requests.get(base_url)\n", + "d = r.json()\n", + "```\n", + "\n", + "* function should parametrize: \n", + " * `category_main_cb` - `{'flat':1, 'house':2, 'land':3 }`\n", + " * `category_type_cb` - `{'sell':1,'rent':2}`\n", + " * `locality_region_id` - use 10 as default value\n", + " * `page` parameter\n", + "* use string inputs for `category_main_cb` and `category_type_cb`\n", + "* include try/except clause to handle errors\n", + "* function should return JSON data in python types\n", + "* do not forget to sleep each request at least 0.5s" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "55cca328-f8c4-4fba-a0b3-6d20c990d712", + "metadata": {}, + "outputs": [], + "source": [ + "def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):\n", + " category_mains = {'flat':1, 'house':2, 'land':3 }\n", + " category_types = {'sell':1,'rent':2}\n", + " template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page60&page={page}'\n", + " request_url = template_url.format(\n", + " category_main=category_mains[category_main_str],\n", + " category_type=category_types[category_type_str],\n", + " locality_region_id=locality_region_id,\n", + " page=page\n", + " )\n", + " r = requests.get(request_url)\n", + " return r.json()\n", + "d = request_sreality(0, 'flat', 'sell', 10)" + ] + }, + { + "cell_type": "markdown", + "id": "8b718701-e4a4-4fe4-bf34-d03913765b2a", + "metadata": {}, + "source": [ + "### 1b. Create a function converting sreality json data into pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "71e52613-8e0e-4b5a-a579-76d803eafa31", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_sreality_data_to_df(sreality_data):\n", + " return\n", + "\n", + "raw = convert_sreality_data_to_df(d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "219610c5-fea8-487c-8682-2e803d1fc2d1", + "metadata": {}, + "outputs": [], + "source": [ + "raw.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fc2cde54-c6c3-4baf-9e4c-b740d8eb4dbd", + "metadata": { + "tags": [] + }, + "source": [ + "### 1c. link function `1b` into function `1a`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8da8611-df45-4f30-87d6-8059f61f810d", + "metadata": {}, + "outputs": [], + "source": [ + "df = request_sreality(0, 'flat', 'sell', 10)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "4ebab853-b6f2-4335-b13a-6c3cbba1951b", + "metadata": {}, + "source": [ + "### 1c. Combining multiple requests into single df\n", + "\n", + "* Function should parametrize:\n", + " * `start_page` and `end_page`\n", + " * request parameters\n", + "* construct a list of individual request dfs\n", + "* then feed it into `pd.concat` function" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "bc61d311-c46a-4aee-a004-8349ec3ce0de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21, 27)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "b1d9bef1-7e5c-4648-89a4-6f472968f3c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "request_sreality" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "284687ef-aba6-4bbf-b7bf-c42dafda4cb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(103, 27)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def request_multiply_sreality(start_page, end_page, category_main_str, category_type_str, locality_region_id=10):\n", + " \n", + " return pd.concat(list_of_dfs)\n", + "\n", + "df = request_multiply_sreality(1, 5, 'flat', 'sell',10)\n", + "df.shape" + ] + }, + { + "cell_type": "markdown", + "id": "bdde40e7-f68e-4859-878e-772c112f7355", + "metadata": {}, + "source": [ + "## Task 2: Cleaning data\n", + "\n", + "### 2a. Filter columns\n", + "* filter only columns: `['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']`\n", + "* use `.copy()` to avoid `SettingWithCopyWarning` later\n" + ] + }, + { + "cell_type": "markdown", + "id": "80deec04-4959-4d9a-8a3a-7cf616e8558a", + "metadata": { + "tags": [] + }, + "source": [ + "### 2b: GPS\n", + "* Convert dictionary in `gps` column into two columns - `lat` and `lon`\n", + "* use apply function on gps column\n", + "* Note apply can return multiple columns" + ] + }, + { + "cell_type": "markdown", + "id": "36c22408-c327-4c17-b1b4-de54f63f0627", + "metadata": {}, + "source": [ + "### 2b. Get flat type from name\n", + "* Name is always represented by string `Prodej bytu [type of flat] [Area] m^2`\n", + "* try picking third word in string\n", + "* check meaningfulness using `.value_counts()`" + ] + }, + { + "cell_type": "markdown", + "id": "5e233b14-db62-41f8-be82-45c861d62e3e", + "metadata": {}, + "source": [ + "### 2c. Get area from name\n", + "* Naive: select the word before last word\n", + "* Then try navigating using the index of `'m²'`\n", + "* if this also fail, then you will need to use regex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "150ff188-b6d0-4326-95c3-cfc30e6fcb03", + "metadata": {}, + "outputs": [], + "source": [ + "def name_to_area(nm):\n", + "\n", + " return \n", + "\n", + "clean['area_2'] = clean.name.apply(name_to_area)\n", + "clean" + ] + }, + { + "cell_type": "markdown", + "id": "ce71f809-7a5a-487e-882a-6aa9c7124727", + "metadata": {}, + "source": [ + "## Bonus: Convert `labelsAll` into categorical variables\n", + "\n", + "### Task 4a. Get all possible label names\n", + "* deal with nested-list structure\n", + "* Hint: try to sum the whole column\n", + "* Needed to Iterate through all labels in all rows and " + ] + }, + { + "cell_type": "markdown", + "id": "db0b86aa-57b0-439d-a82f-1d8f962be7c2", + "metadata": {}, + "source": [ + "### 4b. Test existence of label `cellar` for offers\n", + "* again deal with nested list of list structure\n", + "* write generic function `test_existence_of_label(offer_labels,label)`" + ] + }, + { + "cell_type": "markdown", + "id": "d5e22365-b2d8-4c57-a0cd-7297efb8b948", + "metadata": {}, + "source": [ + "### 4c. Test existence of all possible labels\n", + "* use apply returning series with all labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8eceb6c0-9af6-4fb9-b178-f371dd453d39", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}