diff --git a/Seminar3/seminar_3_solved.ipynb b/Seminar3/seminar_3_solved.ipynb new file mode 100644 index 0000000..ae7000f --- /dev/null +++ b/Seminar3/seminar_3_solved.ipynb @@ -0,0 +1,3097 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "492eb907-b454-403f-914a-281a61751e07", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "id": "9b9babd9-e6ad-49bd-9e5e-c479363d305b", + "metadata": { + "tags": [] + }, + "source": [ + "# Seminar - APIs, DBs and Live coding" + ] + }, + { + "cell_type": "markdown", + "id": "4d7941ab-1155-4c89-8095-94edf11f889d", + "metadata": { + "tags": [] + }, + "source": [ + "## Task 1: Requesting API\n", + "### 1a. Create a function requesting data from sreality\n", + "\n", + "\n", + "```python\n", + "base_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb=1&category_type_cb=1&locality_region_id=10&per_page=60&page={}'.format(i)\n", + "\n", + "r = requests.get(base_url)\n", + "d = r.json()\n", + "```\n", + "\n", + "* function should parametrize: \n", + " * `category_main_cb` - `{'flat':1, 'house':2, 'land':3 }`\n", + " * `category_type_cb` - `{'sell':1,'rent':2}`\n", + " * `locality_region_id` - use 10 as default value\n", + " * `page` parameter\n", + "* use string inputs for `category_main_cb` and `category_type_cb`\n", + "* include try/except clause to handle errors\n", + "* function should return JSON data in python types\n", + "* do not forget to sleep for at least 0.5 s before each request" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3bf7c9dc-be77-48ad-b373-c1525983da7a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hi\n", + "CPU times: user 74 µs, sys: 15 µs, total: 89 µs\n", + "Wall time: 99.2 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "print('Hi')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": 
"8d075a9e-e094-483d-9a9f-2fd5a7eae194", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.6 ms, sys: 1.12 ms, total: 2.72 ms\n", + "Wall time: 5.01 s\n" + ] + } + ], + "source": [ + "%%time\n", + "time.sleep(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b8758ddd-8357-4b98-b9ac-e8cf0c3629b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2.35 ms, sys: 0 ns, total: 2.35 ms\n", + "Wall time: 1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "time.sleep(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "55cca328-f8c4-4fba-a0b3-6d20c990d712", + "metadata": {}, + "outputs": [], + "source": [ + "def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):\n", + "    \"\"\"Request one page of estate listings from the sreality.cz API.\n", + "\n", + "    Parameters\n", + "    ----------\n", + "    page : int\n", + "        Page number to request.\n", + "    category_main_str : str\n", + "        Estate category: 'flat', 'house' or 'land'.\n", + "    category_type_str : str\n", + "        Offer type: 'sell' or 'rent'.\n", + "    locality_region_id : int, default 10\n", + "        Region id passed to the API unchanged.\n", + "\n", + "    Returns\n", + "    -------\n", + "    dict or None\n", + "        Decoded JSON response, or None when anything failed (the error is printed, not raised).\n", + "    \"\"\"\n", + "    time.sleep(0.5)  # throttle: wait at least 0.5 s before every request\n", + "    category_mains = {'flat': 1, 'house': 2, 'land': 3}\n", + "    category_types = {'sell': 1, 'rent': 2}\n", + "    # query parameters must be written as key=value, hence 'per_page=60'\n", + "    template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page=60&page={page}'\n", + "    try:\n", + "        # category lookups stay inside the try so an unknown category is reported like any other failure\n", + "        request_url = template_url.format(\n", + "            category_main=category_mains[category_main_str],\n", + "            category_type=category_types[category_type_str],\n", + "            locality_region_id=locality_region_id,\n", + "            page=page\n", + "        )\n", + "        r = requests.get(request_url, timeout=30)  # do not wait forever on a stalled connection\n", + "        r.raise_for_status()  # treat HTTP error statuses as failures instead of parsing their body\n", + "        return r.json()\n", + "    except Exception as e:\n", + "        # repr() keeps the exception type visible, e.g. KeyError('flats') instead of just 'flats'\n", + "        print(f'error requesting sreality page {page}: {e!r}')\n", + "\n", + "d = request_sreality(0, 'flat', 'sell', 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1a40af97-3f19-4a45-ab62-13ed539b6c86", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['meta_description', 'result_size', '_embedded', 'filterLabels', 'title', 'filter', '_links', 'locality', 'locality_dativ', 'logged_in', 'per_page', 'category_instrumental', 'page', 'filterLabels2'])" + ] + }, 
+ "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "768f1e7e-5330-4f55-b846-7bd02252d45b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'5045 realit v nabídce prodej bytů Praha. Vyberte si novou nemovitost na sreality.cz s hledáním na mapě a velkými náhledy fotografií nabízených bytů.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d['meta_description']" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "78942b2d-e947-47e0-8dc2-fe333f7c31dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5045" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d['result_size']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "32a6da59-18de-4ce0-8127-873f6ec29a0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['estates', 'is_saved', 'not_precise_location_count'])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d['_embedded'].keys()" + ] + }, + { + "cell_type": "markdown", + "id": "8b718701-e4a4-4fe4-bf34-d03913765b2a", + "metadata": {}, + "source": [ + "### 1b. 
Create a function converting sreality json data into pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c1eae9b3-0571-4699-9933-3868f362ef83", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(d['_embedded']['estates'])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "1c60d718-0c53-4b00-9de3-234c72e938ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "27" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(d['_embedded']['estates'][4].keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "99e70448-9475-465b-a85c-e4bc5cd1778a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
044443
1144454
24555455553
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 44 443\n", + "1 14 4454\n", + "2 45554 55553" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_lists = [{'a':44, 'b':443},{'a':14, 'b':4454},{'a':45554, 'b':55553}]\n", + "pd.DataFrame(data_lists)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "71e52613-8e0e-4b5a-a579-76d803eafa31", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_sreality_data_to_df(sreality_data):\n", + "    \"\"\"Build a DataFrame from the estates list of a decoded sreality response.\n", + "\n", + "    Reads the list stored under sreality_data['_embedded']['estates'];\n", + "    each list item becomes one row.\n", + "    \"\"\"\n", + "    estates = sreality_data['_embedded']['estates']\n", + "    return pd.DataFrame(estates)\n", + "\n", + "raw = convert_sreality_data_to_df(d)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "219610c5-fea8-487c-8682-2e803d1fc2d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelsReleasedhas_panoramalabelsis_auctionlabelsAllseoexclusively_at_rkcategoryhas_floor_plan_embedded...hash_idattractive_offerpriceprice_czk_linksrusnameregion_tipgpshas_matterport_url
0[[balcony, parking_lots, garage], []]0[Balkon, Parkování, Garáž]False[[personal, balcony, brick, elevator, parking_...{'category_main_cb': 1, 'category_sub_cb': 8, ...010{'favourite': {'is_favourite': False, '_links'......58234188012862000{'value_raw': 12862000, 'unit': '', 'name': 'C...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 4+kk 128 m²2693402{'lat': 50.12603618747833, 'lon': 14.561554812...False
1[[not_furnished], [restaurant, drugstore]]0[Nevybavený, Restaurace 1 min. pěšky, Lékárna ...False[[new_building, personal, elevator, not_furnis...{'category_main_cb': 1, 'category_sub_cb': 2, ...011{'favourite': {'is_favourite': False, '_links'......8942906803990000{'value_raw': 3990000, 'unit': '', 'name': 'Ce...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 1+kk 24 m²0{'lat': 50.09041518747833, 'lon': 14.531943812...False
2[[], []]0[]False[[new_building, personal, brick, cellar, eleva...{'category_main_cb': 1, 'category_sub_cb': 6, ...011{'favourite': {'is_favourite': False, '_links'......567759948021978000{'value_raw': 21978000, 'unit': '', 'name': 'C...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 3+kk 122 m²0{'lat': 50.06292218747833, 'lon': 14.381577812...False
3[[], []]0[]False[[new_building, personal, brick, cellar, eleva...{'category_main_cb': 1, 'category_sub_cb': 6, ...011{'favourite': {'is_favourite': False, '_links'......618091596018559000{'value_raw': 18559000, 'unit': '', 'name': 'C...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 3+kk 103 m²0{'lat': 50.06292218747833, 'lon': 14.381577812...False
4[[], [kindergarten, drugstore]]0[Školka 6 min. pěšky, Lékárna 5 min. pěšky]False[[new_building, personal, brick], [candy_shop,...{'category_main_cb': 1, 'category_sub_cb': 8, ...011{'favourite': {'is_favourite': False, '_links'......973042764021876000{'value_raw': 21876000, 'unit': '', 'name': 'C...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 4+kk 139 m²0{'lat': 50.06782018747833, 'lon': 14.507568812...True
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " labelsReleased has_panorama \\\n", + "0 [[balcony, parking_lots, garage], []] 0 \n", + "1 [[not_furnished], [restaurant, drugstore]] 0 \n", + "2 [[], []] 0 \n", + "3 [[], []] 0 \n", + "4 [[], [kindergarten, drugstore]] 0 \n", + "\n", + " labels is_auction \\\n", + "0 [Balkon, Parkování, Garáž] False \n", + "1 [Nevybavený, Restaurace 1 min. pěšky, Lékárna ... False \n", + "2 [] False \n", + "3 [] False \n", + "4 [Školka 6 min. pěšky, Lékárna 5 min. pěšky] False \n", + "\n", + " labelsAll \\\n", + "0 [[personal, balcony, brick, elevator, parking_... \n", + "1 [[new_building, personal, elevator, not_furnis... \n", + "2 [[new_building, personal, brick, cellar, eleva... \n", + "3 [[new_building, personal, brick, cellar, eleva... \n", + "4 [[new_building, personal, brick], [candy_shop,... \n", + "\n", + " seo exclusively_at_rk \\\n", + "0 {'category_main_cb': 1, 'category_sub_cb': 8, ... 0 \n", + "1 {'category_main_cb': 1, 'category_sub_cb': 2, ... 0 \n", + "2 {'category_main_cb': 1, 'category_sub_cb': 6, ... 0 \n", + "3 {'category_main_cb': 1, 'category_sub_cb': 6, ... 0 \n", + "4 {'category_main_cb': 1, 'category_sub_cb': 8, ... 0 \n", + "\n", + " category has_floor_plan \\\n", + "0 1 0 \n", + "1 1 1 \n", + "2 1 1 \n", + "3 1 1 \n", + "4 1 1 \n", + "\n", + " _embedded ... hash_id \\\n", + "0 {'favourite': {'is_favourite': False, '_links'... ... 58234188 \n", + "1 {'favourite': {'is_favourite': False, '_links'... ... 89429068 \n", + "2 {'favourite': {'is_favourite': False, '_links'... ... 567759948 \n", + "3 {'favourite': {'is_favourite': False, '_links'... ... 618091596 \n", + "4 {'favourite': {'is_favourite': False, '_links'... ... 973042764 \n", + "\n", + " attractive_offer price \\\n", + "0 0 12862000 \n", + "1 0 3990000 \n", + "2 0 21978000 \n", + "3 0 18559000 \n", + "4 0 21876000 \n", + "\n", + " price_czk \\\n", + "0 {'value_raw': 12862000, 'unit': '', 'name': 'C... \n", + "1 {'value_raw': 3990000, 'unit': '', 'name': 'Ce... 
\n", + "2 {'value_raw': 21978000, 'unit': '', 'name': 'C... \n", + "3 {'value_raw': 18559000, 'unit': '', 'name': 'C... \n", + "4 {'value_raw': 21876000, 'unit': '', 'name': 'C... \n", + "\n", + " _links rus \\\n", + "0 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "1 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "2 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "3 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "4 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "\n", + " name region_tip \\\n", + "0 Prodej bytu 4+kk 128 m² 2693402 \n", + "1 Prodej bytu 1+kk 24 m² 0 \n", + "2 Prodej bytu 3+kk 122 m² 0 \n", + "3 Prodej bytu 3+kk 103 m² 0 \n", + "4 Prodej bytu 4+kk 139 m² 0 \n", + "\n", + " gps has_matterport_url \n", + "0 {'lat': 50.12603618747833, 'lon': 14.561554812... False \n", + "1 {'lat': 50.09041518747833, 'lon': 14.531943812... False \n", + "2 {'lat': 50.06292218747833, 'lon': 14.381577812... False \n", + "3 {'lat': 50.06292218747833, 'lon': 14.381577812... False \n", + "4 {'lat': 50.06782018747833, 'lon': 14.507568812... True \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fc2cde54-c6c3-4baf-9e4c-b740d8eb4dbd", + "metadata": { + "tags": [] + }, + "source": [ + "### 1c. link function `1b` into function `1a`" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "e8da8611-df45-4f30-87d6-8059f61f810d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelsReleasedhas_panoramalabelsis_auctionlabelsAllseoexclusively_at_rkcategoryhas_floor_plan_embedded...hash_idattractive_offerpriceprice_czk_linksrusnameregion_tipgpshas_matterport_url
0[[new_building, garage], []]0[Novostavba, Garáž]False[[new_building, personal, terrace, elevator, p...{'category_main_cb': 1, 'category_sub_cb': 6, ...011{'favourite': {'is_favourite': False, '_links'......568091724021760000{'value_raw': 21760000, 'unit': '', 'name': 'C...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 3+kk 123 m²0{'lat': 50.06301418747833, 'lon': 14.376991812...False
1[[], [post_office, medic]]0[Pošta 6 min. pěšky, Lékař 6 min. pěšky]False[[personal, brick], [candy_shop, small_shop, t...{'category_main_cb': 1, 'category_sub_cb': 4, ...010{'favourite': {'is_favourite': False, '_links'......2946720844024335000{'value_raw': 24335000, 'unit': '', 'name': 'C...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 2+kk 160 m²0{'lat': 50.07837518747833, 'lon': 14.436064812...False
2[[], [metro, shop]]0[Metro 5 min. pěšky, Obchod 5 min. pěšky]False[[personal, balcony, cellar, elevator, parking...{'category_main_cb': 1, 'category_sub_cb': 6, ...010{'favourite': {'is_favourite': False, '_links'......400340300014034000{'value_raw': 14034000, 'unit': '', 'name': 'C...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 3+kk 108 m²0{'lat': 50.03316718747833, 'lon': 14.336494812...False
3[[after_reconstruction], [metro, shop]]0[Po rekonstrukci, Metro 2 min. pěšky, Obchod 3...False[[personal, after_reconstruction, brick, parki...{'category_main_cb': 1, 'category_sub_cb': 2, ...011{'favourite': {'is_favourite': False, '_links'......114434158007017000{'value_raw': 7017000, 'unit': '', 'name': 'Ce...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 1+kk 39 m²0{'lat': 50.05947018747833, 'lon': 14.419744812...False
4[[], [post_office]]0[Pošta 6 min. pěšky]False[[personal, terrace, elevator], [small_shop, t...{'category_main_cb': 1, 'category_sub_cb': 4, ...011{'favourite': {'is_favourite': False, '_links'......162754337208694000{'value_raw': 8694000, 'unit': '', 'name': 'Ce...{'dynamicDown': [{'href': 'https://d18-a.sdn.c...FalseProdej bytu 2+kk 46 m²0{'lat': 50.092200187478326, 'lon': 14.46233681...False
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " labelsReleased has_panorama \\\n", + "0 [[new_building, garage], []] 0 \n", + "1 [[], [post_office, medic]] 0 \n", + "2 [[], [metro, shop]] 0 \n", + "3 [[after_reconstruction], [metro, shop]] 0 \n", + "4 [[], [post_office]] 0 \n", + "\n", + " labels is_auction \\\n", + "0 [Novostavba, Garáž] False \n", + "1 [Pošta 6 min. pěšky, Lékař 6 min. pěšky] False \n", + "2 [Metro 5 min. pěšky, Obchod 5 min. pěšky] False \n", + "3 [Po rekonstrukci, Metro 2 min. pěšky, Obchod 3... False \n", + "4 [Pošta 6 min. pěšky] False \n", + "\n", + " labelsAll \\\n", + "0 [[new_building, personal, terrace, elevator, p... \n", + "1 [[personal, brick], [candy_shop, small_shop, t... \n", + "2 [[personal, balcony, cellar, elevator, parking... \n", + "3 [[personal, after_reconstruction, brick, parki... \n", + "4 [[personal, terrace, elevator], [small_shop, t... \n", + "\n", + " seo exclusively_at_rk \\\n", + "0 {'category_main_cb': 1, 'category_sub_cb': 6, ... 0 \n", + "1 {'category_main_cb': 1, 'category_sub_cb': 4, ... 0 \n", + "2 {'category_main_cb': 1, 'category_sub_cb': 6, ... 0 \n", + "3 {'category_main_cb': 1, 'category_sub_cb': 2, ... 0 \n", + "4 {'category_main_cb': 1, 'category_sub_cb': 4, ... 0 \n", + "\n", + " category has_floor_plan \\\n", + "0 1 1 \n", + "1 1 0 \n", + "2 1 0 \n", + "3 1 1 \n", + "4 1 1 \n", + "\n", + " _embedded ... hash_id \\\n", + "0 {'favourite': {'is_favourite': False, '_links'... ... 568091724 \n", + "1 {'favourite': {'is_favourite': False, '_links'... ... 2946720844 \n", + "2 {'favourite': {'is_favourite': False, '_links'... ... 400340300 \n", + "3 {'favourite': {'is_favourite': False, '_links'... ... 1144341580 \n", + "4 {'favourite': {'is_favourite': False, '_links'... ... 1627543372 \n", + "\n", + " attractive_offer price \\\n", + "0 0 21760000 \n", + "1 0 24335000 \n", + "2 0 14034000 \n", + "3 0 7017000 \n", + "4 0 8694000 \n", + "\n", + " price_czk \\\n", + "0 {'value_raw': 21760000, 'unit': '', 'name': 'C... 
\n", + "1 {'value_raw': 24335000, 'unit': '', 'name': 'C... \n", + "2 {'value_raw': 14034000, 'unit': '', 'name': 'C... \n", + "3 {'value_raw': 7017000, 'unit': '', 'name': 'Ce... \n", + "4 {'value_raw': 8694000, 'unit': '', 'name': 'Ce... \n", + "\n", + " _links rus \\\n", + "0 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "1 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "2 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "3 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "4 {'dynamicDown': [{'href': 'https://d18-a.sdn.c... False \n", + "\n", + " name region_tip \\\n", + "0 Prodej bytu 3+kk 123 m² 0 \n", + "1 Prodej bytu 2+kk 160 m² 0 \n", + "2 Prodej bytu 3+kk 108 m² 0 \n", + "3 Prodej bytu 1+kk 39 m² 0 \n", + "4 Prodej bytu 2+kk 46 m² 0 \n", + "\n", + " gps has_matterport_url \n", + "0 {'lat': 50.06301418747833, 'lon': 14.376991812... False \n", + "1 {'lat': 50.07837518747833, 'lon': 14.436064812... False \n", + "2 {'lat': 50.03316718747833, 'lon': 14.336494812... False \n", + "3 {'lat': 50.05947018747833, 'lon': 14.419744812... False \n", + "4 {'lat': 50.092200187478326, 'lon': 14.46233681... 
False \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):\n", + "    \"\"\"Request one page of sreality.cz listings and return it as a DataFrame.\n", + "\n", + "    Parameters\n", + "    ----------\n", + "    page : int\n", + "        Page number to request.\n", + "    category_main_str : str\n", + "        Estate category: 'flat', 'house' or 'land'.\n", + "    category_type_str : str\n", + "        Offer type: 'sell' or 'rent'.\n", + "    locality_region_id : int, default 10\n", + "        Region id passed to the API unchanged.\n", + "\n", + "    Returns\n", + "    -------\n", + "    pandas.DataFrame or None\n", + "        Result of convert_sreality_data_to_df on the decoded response, or None when\n", + "        the request or the conversion failed (the error is printed, not raised).\n", + "\n", + "    Raises\n", + "    ------\n", + "    KeyError\n", + "        If a category string is not one of the supported keys (looked up outside the try).\n", + "    \"\"\"\n", + "    category_mains = {'flat': 1, 'house': 2, 'land': 3}\n", + "    category_types = {'sell': 1, 'rent': 2}\n", + "    # query parameters must be written as key=value, hence 'per_page=60'\n", + "    template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page=60&page={page}'\n", + "\n", + "    request_url = template_url.format(\n", + "        category_main=category_mains[category_main_str],\n", + "        category_type=category_types[category_type_str],\n", + "        locality_region_id=locality_region_id,\n", + "        page=page\n", + "    )\n", + "\n", + "    time.sleep(0.5)  # throttle: task 1a asks for at least 0.5 s per request\n", + "    try:\n", + "        r = requests.get(request_url, timeout=30)  # do not wait forever on a stalled connection\n", + "        return convert_sreality_data_to_df(r.json())\n", + "    except Exception as e:\n", + "        # Python 3 exceptions have no `.message` attribute; format the exception itself\n", + "        print(f'error requesting url {request_url}. Reason: {e}')\n", + "\n", + "df = request_sreality(0, 'flat', 'sell', 10)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "4ebab853-b6f2-4335-b13a-6c3cbba1951b", + "metadata": {}, + "source": [ + "### 1d. 
Combining multiple requests into single df\n", + "\n", + "* Function should parametrize:\n", + " * `start_page` and `end_page`\n", + " * request parameters\n", + "* construct a list of individual request dfs\n", + "* then feed it into `pd.concat` function" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "bc61d311-c46a-4aee-a004-8349ec3ce0de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21, 27)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "b1d9bef1-7e5c-4648-89a4-6f472968f3c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "request_sreality" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "284687ef-aba6-4bbf-b7bf-c42dafda4cb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(103, 27)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def request_multiply_sreality(start_page, end_page, category_main_str, category_type_str, locality_region_id=10):\n", + "    \"\"\"Request pages start_page..end_page (inclusive) and combine them into one DataFrame.\n", + "\n", + "    Every page is fetched with request_sreality using the same category and region\n", + "    arguments. Pages for which request_sreality returned None are skipped.\n", + "\n", + "    Returns\n", + "    -------\n", + "    pandas.DataFrame\n", + "        All fetched pages stacked with a fresh 0..n-1 index; an empty DataFrame when\n", + "        no page produced data (all requests failed or start_page > end_page).\n", + "    \"\"\"\n", + "    pages = range(start_page, end_page + 1)\n", + "    list_of_dfs = [request_sreality(page, category_main_str, category_type_str, locality_region_id) for page in pages]\n", + "    # request_sreality returns None after a failed request; pd.concat raises when nothing is left to concatenate\n", + "    list_of_dfs = [page_df for page_df in list_of_dfs if page_df is not None]\n", + "    if not list_of_dfs:\n", + "        return pd.DataFrame()\n", + "    # ignore_index avoids repeating each page's own 0..k index in the combined frame\n", + "    return pd.concat(list_of_dfs, ignore_index=True)\n", + "\n", + "df = request_multiply_sreality(1, 5, 'flat', 'sell',10)\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "cb5b33f7-fce3-4331-9d3e-ecc7b5184253", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.reset_index().drop('index', axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "bdde40e7-f68e-4859-878e-772c112f7355", + "metadata": {}, + "source": [ + "## Task 2: Cleaning data\n", + "\n", + "### 2a. 
Filter columns\n", + "* filter only columns: `['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']`\n", + "* use `.copy()` to avoid `SettingWithCopyWarning` later\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "34d14f44-48f4-4bcd-bac0-ddf282242464", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
localitypricenamegpshash_idexclusively_at_rk
0Praha 9 - Kbely12862000Prodej bytu 4+kk 128 m²{'lat': 50.12603618747833, 'lon': 14.561554812...582341880
1Praha 2 - Vinohrady21566000Prodej bytu 2+kk 126 m²{'lat': 50.06495918747833, 'lon': 14.454340812...41077899000
2Praha 5 - Sobín17382000Prodej bytu 3+kk 97 m²{'lat': 50.052054187478326, 'lon': 14.28598081...19728725241
3Praha 5 - Stodůlky18286000Prodej bytu 4+kk 122 m²{'lat': 50.02775118747833, 'lon': 14.324684812...8663509240
4Praha 5 - Stodůlky14140000Prodej bytu 3+kk 88 m²{'lat': 50.02775118747833, 'lon': 14.324684812...37352548600
.....................
98Praha 8 - Karlín10236000Prodej bytu 1+kk 60 m²{'lat': 50.08081318747833, 'lon': 14.459052812...19180022520
99Praha 4 - Michle29614000Prodej bytu 3+kk 272 m²{'lat': 50.03685218747833, 'lon': 14.467224812...28106192121
100Praha 4 - Modřany14018000Prodej bytu 3+kk 100 m²{'lat': 49.989115187478326, 'lon': 14.41775681...15670208760
101Praha 9 - Kbely11121000Prodej bytu 3+kk 88 m²{'lat': 50.11815518747833, 'lon': 14.550433812...16840428280
102Praha 5 - Stodůlky11421000Prodej bytu 2+kk 74 m²{'lat': 50.03348418747833, 'lon': 14.323431812...3470633721
\n", + "

103 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " locality price name \\\n", + "0 Praha 9 - Kbely 12862000 Prodej bytu 4+kk 128 m² \n", + "1 Praha 2 - Vinohrady 21566000 Prodej bytu 2+kk 126 m² \n", + "2 Praha 5 - Sobín 17382000 Prodej bytu 3+kk 97 m² \n", + "3 Praha 5 - Stodůlky 18286000 Prodej bytu 4+kk 122 m² \n", + "4 Praha 5 - Stodůlky 14140000 Prodej bytu 3+kk 88 m² \n", + ".. ... ... ... \n", + "98 Praha 8 - Karlín 10236000 Prodej bytu 1+kk 60 m² \n", + "99 Praha 4 - Michle 29614000 Prodej bytu 3+kk 272 m² \n", + "100 Praha 4 - Modřany 14018000 Prodej bytu 3+kk 100 m² \n", + "101 Praha 9 - Kbely 11121000 Prodej bytu 3+kk 88 m² \n", + "102 Praha 5 - Stodůlky 11421000 Prodej bytu 2+kk 74 m² \n", + "\n", + " gps hash_id \\\n", + "0 {'lat': 50.12603618747833, 'lon': 14.561554812... 58234188 \n", + "1 {'lat': 50.06495918747833, 'lon': 14.454340812... 4107789900 \n", + "2 {'lat': 50.052054187478326, 'lon': 14.28598081... 1972872524 \n", + "3 {'lat': 50.02775118747833, 'lon': 14.324684812... 866350924 \n", + "4 {'lat': 50.02775118747833, 'lon': 14.324684812... 3735254860 \n", + ".. ... ... \n", + "98 {'lat': 50.08081318747833, 'lon': 14.459052812... 1918002252 \n", + "99 {'lat': 50.03685218747833, 'lon': 14.467224812... 2810619212 \n", + "100 {'lat': 49.989115187478326, 'lon': 14.41775681... 1567020876 \n", + "101 {'lat': 50.11815518747833, 'lon': 14.550433812... 1684042828 \n", + "102 {'lat': 50.03348418747833, 'lon': 14.323431812... 347063372 \n", + "\n", + " exclusively_at_rk \n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 0 \n", + "4 0 \n", + ".. ... 
\n", + "98 0 \n", + "99 1 \n", + "100 0 \n", + "101 0 \n", + "102 1 \n", + "\n", + "[103 rows x 6 columns]" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean = df[['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']].copy()\n", + "clean" + ] + }, + { + "cell_type": "markdown", + "id": "80deec04-4959-4d9a-8a3a-7cf616e8558a", + "metadata": { + "tags": [] + }, + "source": [ + "### 2b: GPS\n", + "* Convert dictionary in `gps` column into two columns - `lat` and `lon`\n", + "* use apply function on gps column\n", + "* Note apply can return multiple columns" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "68f281f1-5169-47f6-a89d-ed1d9f416a48", + "metadata": {}, + "outputs": [], + "source": [ + "clean[['lat', 'lon']] = clean.gps.apply(lambda x: pd.Series({'lat': x['lat'], 'lon': x['lon']}))" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "fbd73a3c-83d5-4b74-8232-58ed33ee1edc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
localitypricenamegpshash_idexclusively_at_rklat1lon1latlon
0Praha 9 - Kbely12862000Prodej bytu 4+kk 128 m²{'lat': 50.12603618747833, 'lon': 14.561554812...58234188050.12603614.56155550.12603614.561555
1Praha 2 - Vinohrady21566000Prodej bytu 2+kk 126 m²{'lat': 50.06495918747833, 'lon': 14.454340812...4107789900050.06495914.45434150.06495914.454341
2Praha 5 - Sobín17382000Prodej bytu 3+kk 97 m²{'lat': 50.052054187478326, 'lon': 14.28598081...1972872524150.05205414.28598150.05205414.285981
3Praha 5 - Stodůlky18286000Prodej bytu 4+kk 122 m²{'lat': 50.02775118747833, 'lon': 14.324684812...866350924050.02775114.32468550.02775114.324685
4Praha 5 - Stodůlky14140000Prodej bytu 3+kk 88 m²{'lat': 50.02775118747833, 'lon': 14.324684812...3735254860050.02775114.32468550.02775114.324685
.................................
98Praha 8 - Karlín10236000Prodej bytu 1+kk 60 m²{'lat': 50.08081318747833, 'lon': 14.459052812...1918002252050.08081314.45905350.08081314.459053
99Praha 4 - Michle29614000Prodej bytu 3+kk 272 m²{'lat': 50.03685218747833, 'lon': 14.467224812...2810619212150.03685214.46722550.03685214.467225
100Praha 4 - Modřany14018000Prodej bytu 3+kk 100 m²{'lat': 49.989115187478326, 'lon': 14.41775681...1567020876049.98911514.41775749.98911514.417757
101Praha 9 - Kbely11121000Prodej bytu 3+kk 88 m²{'lat': 50.11815518747833, 'lon': 14.550433812...1684042828050.11815514.55043450.11815514.550434
102Praha 5 - Stodůlky11421000Prodej bytu 2+kk 74 m²{'lat': 50.03348418747833, 'lon': 14.323431812...347063372150.03348414.32343250.03348414.323432
\n", + "

103 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " locality price name \\\n", + "0 Praha 9 - Kbely 12862000 Prodej bytu 4+kk 128 m² \n", + "1 Praha 2 - Vinohrady 21566000 Prodej bytu 2+kk 126 m² \n", + "2 Praha 5 - Sobín 17382000 Prodej bytu 3+kk 97 m² \n", + "3 Praha 5 - Stodůlky 18286000 Prodej bytu 4+kk 122 m² \n", + "4 Praha 5 - Stodůlky 14140000 Prodej bytu 3+kk 88 m² \n", + ".. ... ... ... \n", + "98 Praha 8 - Karlín 10236000 Prodej bytu 1+kk 60 m² \n", + "99 Praha 4 - Michle 29614000 Prodej bytu 3+kk 272 m² \n", + "100 Praha 4 - Modřany 14018000 Prodej bytu 3+kk 100 m² \n", + "101 Praha 9 - Kbely 11121000 Prodej bytu 3+kk 88 m² \n", + "102 Praha 5 - Stodůlky 11421000 Prodej bytu 2+kk 74 m² \n", + "\n", + " gps hash_id \\\n", + "0 {'lat': 50.12603618747833, 'lon': 14.561554812... 58234188 \n", + "1 {'lat': 50.06495918747833, 'lon': 14.454340812... 4107789900 \n", + "2 {'lat': 50.052054187478326, 'lon': 14.28598081... 1972872524 \n", + "3 {'lat': 50.02775118747833, 'lon': 14.324684812... 866350924 \n", + "4 {'lat': 50.02775118747833, 'lon': 14.324684812... 3735254860 \n", + ".. ... ... \n", + "98 {'lat': 50.08081318747833, 'lon': 14.459052812... 1918002252 \n", + "99 {'lat': 50.03685218747833, 'lon': 14.467224812... 2810619212 \n", + "100 {'lat': 49.989115187478326, 'lon': 14.41775681... 1567020876 \n", + "101 {'lat': 50.11815518747833, 'lon': 14.550433812... 1684042828 \n", + "102 {'lat': 50.03348418747833, 'lon': 14.323431812... 347063372 \n", + "\n", + " exclusively_at_rk lat1 lon1 lat lon \n", + "0 0 50.126036 14.561555 50.126036 14.561555 \n", + "1 0 50.064959 14.454341 50.064959 14.454341 \n", + "2 1 50.052054 14.285981 50.052054 14.285981 \n", + "3 0 50.027751 14.324685 50.027751 14.324685 \n", + "4 0 50.027751 14.324685 50.027751 14.324685 \n", + ".. ... ... ... ... ... 
# The flat layout (e.g. '4+kk') is the third whitespace-separated word of the
# listing name, which has the form 'Prodej bytu <layout> <area> m²'.
clean['flat_type'] = clean.name.apply(lambda nm: nm.split()[2])

# Sanity check requested by the task: every distinct value should look like a
# layout code. Anything else means the "third word" assumption broke for some row.
clean['flat_type'].value_counts()
Get area from name
* Naive: select the second-to-last word
* Then try navigating using the index of `'m²'`
* If this also fails, you will need to use a regex
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
localitypricenamegpshash_idexclusively_at_rklat1lon1latlonflat_typeareaarea_1area_2
0Praha 9 - Kbely12862000Prodej bytu 4+kk 128 m²{'lat': 50.12603618747833, 'lon': 14.561554812...58234188050.12603614.56155550.12603614.561555[Prodej, bytu, 4+kk, 128, m²]128128128
1Praha 2 - Vinohrady21566000Prodej bytu 2+kk 126 m²{'lat': 50.06495918747833, 'lon': 14.454340812...4107789900050.06495914.45434150.06495914.454341[Prodej, bytu, 2+kk, 126, m²]126126126
2Praha 5 - Sobín17382000Prodej bytu 3+kk 97 m²{'lat': 50.052054187478326, 'lon': 14.28598081...1972872524150.05205414.28598150.05205414.285981[Prodej, bytu, 3+kk, 97, m²]979797
3Praha 5 - Stodůlky18286000Prodej bytu 4+kk 122 m²{'lat': 50.02775118747833, 'lon': 14.324684812...866350924050.02775114.32468550.02775114.324685[Prodej, bytu, 4+kk, 122, m²]122122122
4Praha 5 - Stodůlky14140000Prodej bytu 3+kk 88 m²{'lat': 50.02775118747833, 'lon': 14.324684812...3735254860050.02775114.32468550.02775114.324685[Prodej, bytu, 3+kk, 88, m²]888888
.............................................
98Praha 8 - Karlín10236000Prodej bytu 1+kk 60 m²{'lat': 50.08081318747833, 'lon': 14.459052812...1918002252050.08081314.45905350.08081314.459053[Prodej, bytu, 1+kk, 60, m²]606060
99Praha 4 - Michle29614000Prodej bytu 3+kk 272 m²{'lat': 50.03685218747833, 'lon': 14.467224812...2810619212150.03685214.46722550.03685214.467225[Prodej, bytu, 3+kk, 272, m²]272272272
100Praha 4 - Modřany14018000Prodej bytu 3+kk 100 m²{'lat': 49.989115187478326, 'lon': 14.41775681...1567020876049.98911514.41775749.98911514.417757[Prodej, bytu, 3+kk, 100, m²]100100100
101Praha 9 - Kbely11121000Prodej bytu 3+kk 88 m²{'lat': 50.11815518747833, 'lon': 14.550433812...1684042828050.11815514.55043450.11815514.550434[Prodej, bytu, 3+kk, 88, m²]888888
102Praha 5 - Stodůlky11421000Prodej bytu 2+kk 74 m²{'lat': 50.03348418747833, 'lon': 14.323431812...347063372150.03348414.32343250.03348414.323432[Prodej, bytu, 2+kk, 74, m²]747474
\n", + "

103 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " locality price name \\\n", + "0 Praha 9 - Kbely 12862000 Prodej bytu 4+kk 128 m² \n", + "1 Praha 2 - Vinohrady 21566000 Prodej bytu 2+kk 126 m² \n", + "2 Praha 5 - Sobín 17382000 Prodej bytu 3+kk 97 m² \n", + "3 Praha 5 - Stodůlky 18286000 Prodej bytu 4+kk 122 m² \n", + "4 Praha 5 - Stodůlky 14140000 Prodej bytu 3+kk 88 m² \n", + ".. ... ... ... \n", + "98 Praha 8 - Karlín 10236000 Prodej bytu 1+kk 60 m² \n", + "99 Praha 4 - Michle 29614000 Prodej bytu 3+kk 272 m² \n", + "100 Praha 4 - Modřany 14018000 Prodej bytu 3+kk 100 m² \n", + "101 Praha 9 - Kbely 11121000 Prodej bytu 3+kk 88 m² \n", + "102 Praha 5 - Stodůlky 11421000 Prodej bytu 2+kk 74 m² \n", + "\n", + " gps hash_id \\\n", + "0 {'lat': 50.12603618747833, 'lon': 14.561554812... 58234188 \n", + "1 {'lat': 50.06495918747833, 'lon': 14.454340812... 4107789900 \n", + "2 {'lat': 50.052054187478326, 'lon': 14.28598081... 1972872524 \n", + "3 {'lat': 50.02775118747833, 'lon': 14.324684812... 866350924 \n", + "4 {'lat': 50.02775118747833, 'lon': 14.324684812... 3735254860 \n", + ".. ... ... \n", + "98 {'lat': 50.08081318747833, 'lon': 14.459052812... 1918002252 \n", + "99 {'lat': 50.03685218747833, 'lon': 14.467224812... 2810619212 \n", + "100 {'lat': 49.989115187478326, 'lon': 14.41775681... 1567020876 \n", + "101 {'lat': 50.11815518747833, 'lon': 14.550433812... 1684042828 \n", + "102 {'lat': 50.03348418747833, 'lon': 14.323431812... 347063372 \n", + "\n", + " exclusively_at_rk lat1 lon1 lat lon \\\n", + "0 0 50.126036 14.561555 50.126036 14.561555 \n", + "1 0 50.064959 14.454341 50.064959 14.454341 \n", + "2 1 50.052054 14.285981 50.052054 14.285981 \n", + "3 0 50.027751 14.324685 50.027751 14.324685 \n", + "4 0 50.027751 14.324685 50.027751 14.324685 \n", + ".. ... ... ... ... ... 
def name_to_area(nm):
    """Extract the floor area from a listing name such as 'Prodej bytu 4+kk 128 m²'.

    The name is split on whitespace and the token directly in front of the
    first standalone 'm²' token is converted to an int.

    Raises ValueError when no 'm²' token exists or when the preceding token is
    not an integer literal.
    """
    tokens = nm.split()
    unit_position = tokens.index('m²')  # first occurrence; ValueError if absent
    area_token = tokens[unit_position - 1]
    return int(area_token)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
localitypricenamegpshash_idexclusively_at_rklat1lon1latlonflat_typeareaarea_1area_2
\n", + "
# Deliberately naive comparison: area_1 still holds the raw string tokens while
# area_2 holds ints, so no row compares equal and the filter returns an empty frame.
clean.loc[clean['area_1'] == clean['area_2']]
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
localitypricenamegpshash_idexclusively_at_rklat1lon1latlonflat_typeareaarea_1area_2
0Praha 9 - Kbely12862000Prodej bytu 4+kk 128 m²{'lat': 50.12603618747833, 'lon': 14.561554812...58234188050.12603614.56155550.12603614.561555[Prodej, bytu, 4+kk, 128, m²]128128128
1Praha 2 - Vinohrady21566000Prodej bytu 2+kk 126 m²{'lat': 50.06495918747833, 'lon': 14.454340812...4107789900050.06495914.45434150.06495914.454341[Prodej, bytu, 2+kk, 126, m²]126126126
2Praha 5 - Sobín17382000Prodej bytu 3+kk 97 m²{'lat': 50.052054187478326, 'lon': 14.28598081...1972872524150.05205414.28598150.05205414.285981[Prodej, bytu, 3+kk, 97, m²]979797
3Praha 5 - Stodůlky18286000Prodej bytu 4+kk 122 m²{'lat': 50.02775118747833, 'lon': 14.324684812...866350924050.02775114.32468550.02775114.324685[Prodej, bytu, 4+kk, 122, m²]122122122
4Praha 5 - Stodůlky14140000Prodej bytu 3+kk 88 m²{'lat': 50.02775118747833, 'lon': 14.324684812...3735254860050.02775114.32468550.02775114.324685[Prodej, bytu, 3+kk, 88, m²]888888
.............................................
98Praha 8 - Karlín10236000Prodej bytu 1+kk 60 m²{'lat': 50.08081318747833, 'lon': 14.459052812...1918002252050.08081314.45905350.08081314.459053[Prodej, bytu, 1+kk, 60, m²]606060
99Praha 4 - Michle29614000Prodej bytu 3+kk 272 m²{'lat': 50.03685218747833, 'lon': 14.467224812...2810619212150.03685214.46722550.03685214.467225[Prodej, bytu, 3+kk, 272, m²]272272272
100Praha 4 - Modřany14018000Prodej bytu 3+kk 100 m²{'lat': 49.989115187478326, 'lon': 14.41775681...1567020876049.98911514.41775749.98911514.417757[Prodej, bytu, 3+kk, 100, m²]100100100
101Praha 9 - Kbely11121000Prodej bytu 3+kk 88 m²{'lat': 50.11815518747833, 'lon': 14.550433812...1684042828050.11815514.55043450.11815514.550434[Prodej, bytu, 3+kk, 88, m²]888888
102Praha 5 - Stodůlky11421000Prodej bytu 2+kk 74 m²{'lat': 50.03348418747833, 'lon': 14.323431812...347063372150.03348414.32343250.03348414.323432[Prodej, bytu, 2+kk, 74, m²]747474
\n", + "

103 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " locality price name \\\n", + "0 Praha 9 - Kbely 12862000 Prodej bytu 4+kk 128 m² \n", + "1 Praha 2 - Vinohrady 21566000 Prodej bytu 2+kk 126 m² \n", + "2 Praha 5 - Sobín 17382000 Prodej bytu 3+kk 97 m² \n", + "3 Praha 5 - Stodůlky 18286000 Prodej bytu 4+kk 122 m² \n", + "4 Praha 5 - Stodůlky 14140000 Prodej bytu 3+kk 88 m² \n", + ".. ... ... ... \n", + "98 Praha 8 - Karlín 10236000 Prodej bytu 1+kk 60 m² \n", + "99 Praha 4 - Michle 29614000 Prodej bytu 3+kk 272 m² \n", + "100 Praha 4 - Modřany 14018000 Prodej bytu 3+kk 100 m² \n", + "101 Praha 9 - Kbely 11121000 Prodej bytu 3+kk 88 m² \n", + "102 Praha 5 - Stodůlky 11421000 Prodej bytu 2+kk 74 m² \n", + "\n", + " gps hash_id \\\n", + "0 {'lat': 50.12603618747833, 'lon': 14.561554812... 58234188 \n", + "1 {'lat': 50.06495918747833, 'lon': 14.454340812... 4107789900 \n", + "2 {'lat': 50.052054187478326, 'lon': 14.28598081... 1972872524 \n", + "3 {'lat': 50.02775118747833, 'lon': 14.324684812... 866350924 \n", + "4 {'lat': 50.02775118747833, 'lon': 14.324684812... 3735254860 \n", + ".. ... ... \n", + "98 {'lat': 50.08081318747833, 'lon': 14.459052812... 1918002252 \n", + "99 {'lat': 50.03685218747833, 'lon': 14.467224812... 2810619212 \n", + "100 {'lat': 49.989115187478326, 'lon': 14.41775681... 1567020876 \n", + "101 {'lat': 50.11815518747833, 'lon': 14.550433812... 1684042828 \n", + "102 {'lat': 50.03348418747833, 'lon': 14.323431812... 347063372 \n", + "\n", + " exclusively_at_rk lat1 lon1 lat lon \\\n", + "0 0 50.126036 14.561555 50.126036 14.561555 \n", + "1 0 50.064959 14.454341 50.064959 14.454341 \n", + "2 1 50.052054 14.285981 50.052054 14.285981 \n", + "3 0 50.027751 14.324685 50.027751 14.324685 \n", + "4 0 50.027751 14.324685 50.027751 14.324685 \n", + ".. ... ... ... ... ... 
# Cast the string tokens in area_1 to int before comparing, so rows where both
# extraction approaches agree are actually selected.
clean.loc[clean['area_1'].astype(int) == clean['area_2']]
# Collect every distinct label that occurs in any offer.
# Each `labelsAll` entry is a list of label groups (a list of lists), so two
# levels are flattened. Iterating the column directly gives the same labels as
# summing it, without building one huge concatenated list first, and it also
# works when the column is empty.
# Sorting makes the result reproducible: the iteration order of a set of
# strings changes between interpreter runs, which would shuffle the columns of
# any table built from this list.
possible_labels = sorted({
    label
    for offer_labels in raw.labelsAll
    for label_group in offer_labels
    for label in label_group
})
possible_labels
Test existence of label `cellar` for offers\n", + "* again deal with nested list of list structure\n", + "* write generic function `test_existence_of_label(offer_labels,label)`" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "a633c468-e096-46bf-a51e-0f30f11cca26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 True\n", + "3 True\n", + "4 False\n", + "5 True\n", + "6 True\n", + "7 False\n", + "8 True\n", + "9 True\n", + "10 True\n", + "11 True\n", + "12 True\n", + "13 True\n", + "14 True\n", + "15 True\n", + "16 True\n", + "17 True\n", + "18 True\n", + "19 True\n", + "20 True\n", + "Name: labelsAll, dtype: bool" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def test_existence_of_label(offer_labels,label):\n", + " return 'cellar' in [item for sublist in offer_labels for item in sublist]\n", + "\n", + "raw.labelsAll.apply(lambda offer_labels: test_existence_of_label(offer_labels, 'cellar'))" + ] + }, + { + "cell_type": "markdown", + "id": "d5e22365-b2d8-4c57-a0cd-7297efb8b948", + "metadata": {}, + "source": [ + "### 4c. Test existence of all possible labels\n", + "* use apply returning series with all labels" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "8165a5a4-a52c-453a-b3e9-39d868fe5501", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
natural_attractionkindergartentrammoviescellarbrickcandy_shoptrainmetrobus_public_transport...shopmedicpost_officesightseeingrestaurantin_constructionatmsportsgaragedrugstore
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
2TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
3TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
5TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
6TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
7FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
8TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
9TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
10TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
11TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
12TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
13TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
14TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
15TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
16TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
17TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
18TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
19TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
20TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
\n", + "

21 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + " natural_attraction kindergarten tram movies cellar brick \\\n", + "0 False False False False False False \n", + "1 False False False False False False \n", + "2 True True True True True True \n", + "3 True True True True True True \n", + "4 False False False False False False \n", + "5 True True True True True True \n", + "6 True True True True True True \n", + "7 False False False False False False \n", + "8 True True True True True True \n", + "9 True True True True True True \n", + "10 True True True True True True \n", + "11 True True True True True True \n", + "12 True True True True True True \n", + "13 True True True True True True \n", + "14 True True True True True True \n", + "15 True True True True True True \n", + "16 True True True True True True \n", + "17 True True True True True True \n", + "18 True True True True True True \n", + "19 True True True True True True \n", + "20 True True True True True True \n", + "\n", + " candy_shop train metro bus_public_transport ... shop medic \\\n", + "0 False False False False ... False False \n", + "1 False False False False ... False False \n", + "2 True True True True ... True True \n", + "3 True True True True ... True True \n", + "4 False False False False ... False False \n", + "5 True True True True ... True True \n", + "6 True True True True ... True True \n", + "7 False False False False ... False False \n", + "8 True True True True ... True True \n", + "9 True True True True ... True True \n", + "10 True True True True ... True True \n", + "11 True True True True ... True True \n", + "12 True True True True ... True True \n", + "13 True True True True ... True True \n", + "14 True True True True ... True True \n", + "15 True True True True ... True True \n", + "16 True True True True ... True True \n", + "17 True True True True ... True True \n", + "18 True True True True ... True True \n", + "19 True True True True ... True True \n", + "20 True True True True ... 
def existence_of_all_labels(offer_labels, possible_labels):
    """Build a per-label presence vector for a single offer.

    Parameters
    ----------
    offer_labels
        The labels attached to one offer; passed through unchanged to
        ``test_existence_of_label``.
    possible_labels
        Iterable of label names to test for.

    Returns
    -------
    pd.Series
        Indexed by label name; each value is whatever
        ``test_existence_of_label(offer_labels, label)`` returns for it.
    """
    # NOTE(review): in the recorded output every visible row is uniformly
    # True or uniformly False across columns -- confirm that
    # test_existence_of_label really checks the specific label it is given.
    presence = {}
    for candidate in possible_labels:
        presence[candidate] = test_existence_of_label(offer_labels, candidate)
    return pd.Series(presence)


# One row per offer, one column per possible label (apply expands the
# returned Series into DataFrame columns).
raw["labelsAll"].apply(
    lambda labels_of_offer: existence_of_all_labels(labels_of_offer, possible_labels)
)
"8eceb6c0-9af6-4fb9-b178-f371dd453d39", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}