TST: fix ton of warnings (#627)
bifbof authored Jun 30, 2024
1 parent 983d27e commit a1b5e46
Showing 16 changed files with 62 additions and 58 deletions.
5 changes: 3 additions & 2 deletions tests/analysis/test_tracking_quality.py
@@ -188,11 +188,12 @@ def test_tracking_quality_user_error(self, testdata_sp_tpls_geolife_long):
         """Test if an error is raised when passing unknown 'granularity' to _get_tracking_quality_user()."""
         sp_tpls = testdata_sp_tpls_geolife_long
         user_0 = sp_tpls.loc[sp_tpls["user_id"] == 0]
+        start_date = sp_tpls["started_at"].min().floor(freq="D")

         with pytest.raises(ValueError):
-            ti.analysis.tracking_quality._get_tracking_quality_user(user_0, granularity=12345)
+            ti.analysis.tracking_quality._get_tracking_quality_user(user_0, start_date, granularity=12345)
         with pytest.raises(ValueError):
-            ti.analysis.tracking_quality._get_tracking_quality_user(user_0, granularity="random")
+            ti.analysis.tracking_quality._get_tracking_quality_user(user_0, start_date, granularity="random")

     def test_staypoints_accessors(self, testdata_all_geolife_long):
         """Test tracking_quality calculation from staypoints accessor."""
2 changes: 1 addition & 1 deletion tests/data/geolife_long/sp_tpls.csv
@@ -42,4 +42,4 @@ id,started_at,finished_at,user_id,type,is_activity,trip_id,prev_trip_id,next_tri
 20,2008-10-24 04:12:55+00:00,2008-10-24 05:28:05+00:00,1,staypoint,True,,11,12,2008-10-24 05:28:05+00:00,False,False
 20,2008-10-24 05:28:05+00:00,2008-10-24 05:39:50+00:00,1,tripleg,False,12,,,2008-10-24 05:39:53+00:00,True,False
 21,2008-10-24 05:39:53+00:00,2008-10-24 06:08:42+00:00,1,staypoint,True,,12,13,2008-10-24 06:08:42+00:00,False,False
-21,2008-10-24 06:08:42+00:00,2008-10-24 06:35:50+00:00,1,tripleg,False,13,,,,,False
+21,2008-10-24 06:08:42+00:00,2008-10-24 06:35:50+00:00,1,tripleg,False,13,,,,False,False
2 changes: 1 addition & 1 deletion tests/data/trips/sp_tpls_gaps.csv
@@ -29,4 +29,4 @@ id,started_at,finished_at,user_id,type,is_activity,trip_id,prev_trip_id,next_tri
 70,2010-01-13 20:40:00,2010-01-14 00:44:00,1,staypoint,True,,8.0,,2010-01-15 20:39:00,False,True
 126,2010-01-15 20:39:00,2010-01-15 20:40:00,1,tripleg,False,9.0,,,2010-01-15 20:44:00,False,False
 127,2010-01-15 20:44:00,2010-01-15 20:50:00,1,tripleg,False,9.0,,,2010-01-17 20:39:00,False,True
-128,2010-01-17 20:39:00,2010-01-17 20:40:00,1,tripleg,False,10.0,,,,,False
+128,2010-01-17 20:39:00,2010-01-17 20:40:00,1,tripleg,False,10.0,,,,False,False
1 change: 0 additions & 1 deletion tests/geogr/test_distances.py
@@ -242,7 +242,6 @@ def test_known_euclidean_distance(self, two_pfs):
         pfs0, euc00, pfs1, euc01 = two_pfs
         res00 = calculate_distance_matrix(X=pfs0, dist_metric="euclidean")
         res01 = calculate_distance_matrix(X=pfs0, Y=pfs1, dist_metric="euclidean")
-        print(res00)
         assert np.all(euc00 == res00)
         assert np.all(euc01 == res01)

4 changes: 1 addition & 3 deletions tests/geogr/test_filter.py
@@ -18,8 +18,6 @@ def locs_from_geolife():
         method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
     )

-    # the projection needs to be defined: WGS84
-    locs.crs = "epsg:4326"
     return locs


@@ -79,7 +77,7 @@ def test_filter_triplegs(self):
     def test_filter_locations(self, locs_from_geolife):
         """Test if spatial_filter works for locations."""
         locs = locs_from_geolife
-        extent = gpd.read_file(os.path.join("tests", "data", "area", "tsinghua.geojson"), crs="epsg:4326")
+        extent = gpd.read_file(os.path.join("tests", "data", "area", "tsinghua.geojson"))

         # filter locations with the area
         within_loc = locs.spatial_filter(areas=extent, method="within", re_project=True)
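
A note on the dropped keyword: `gpd.read_file` has no `crs` parameter of its own; the CRS comes from the file's metadata, and stray keywords are handed to the underlying IO engine, which newer GeoPandas versions warn about or reject. A minimal sketch of the corrected call:

```python
import geopandas as gpd

# The CRS is read from the GeoJSON itself; passing crs= would be forwarded
# to the IO engine and can trigger a warning in recent GeoPandas versions.
extent = gpd.read_file("tests/data/area/tsinghua.geojson")
print(extent.crs)  # whatever CRS the file declares, typically EPSG:4326
```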
4 changes: 2 additions & 2 deletions tests/io/test_from_geopandas.py
@@ -66,9 +66,9 @@ def test_setting_geometry(self, example_positionfixes):
     def test_set_crs(self, example_positionfixes):
         """Test if crs will be set."""
         pfs = example_positionfixes.copy()
-        example_positionfixes.crs = "EPSG:2056"
+        example_positionfixes = example_positionfixes.set_crs("EPSG:2056", allow_override=True)
         # check if the crs is correctly set
-        pfs.crs = None
+        pfs = pfs.set_crs(None, allow_override=True)
         pfs = _trackintel_model(pfs, crs="EPSG:2056")
         assert_geodataframe_equal(example_positionfixes, pfs)
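
Most files in this commit apply the same fix: assigning `gdf.crs = ...` on a frame that already carries a CRS makes GeoPandas warn about unsafely overriding it, while `set_crs(..., allow_override=True)` states that intent explicitly. A minimal sketch of the two idioms (standalone example, assuming a recent GeoPandas):

```python
import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(geometry=[Point(8.54, 47.37)], crs="EPSG:4326")

# Old idiom: overriding an existing CRS via attribute assignment warns in
# recent GeoPandas (and never reprojects the coordinates).
# gdf.crs = "EPSG:2056"

# New idiom: explicitly allow replacing the CRS metadata.
gdf = gdf.set_crs("EPSG:2056", allow_override=True)

# To actually transform coordinates, to_crs is the right tool instead.
projected = gdf.to_crs("EPSG:4326")
```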

10 changes: 5 additions & 5 deletions tests/io/test_postgis.py
@@ -281,7 +281,7 @@ def test_no_crs(self, example_positionfixes, conn_postgis):
         table = "positionfixes"
         sql = f"SELECT * FROM {table}"
         geom_col = pfs.geometry.name
-        pfs.crs = None
+        pfs = pfs.set_crs(None, allow_override=True)
         no_crs_warning = "Could not parse CRS from the GeoDataFrame. Inserting data without defined CRS."
         try:
             with pytest.warns(UserWarning, match=no_crs_warning):
@@ -375,7 +375,7 @@ def test_no_crs(self, example_triplegs, conn_postgis):
         table = "triplegs"
         sql = f"SELECT * FROM {table}"
         geom_col = tpls.geometry.name
-        tpls.crs = None
+        tpls = tpls.set_crs(None, allow_override=True)

         no_crs_warning = "Could not parse CRS from the GeoDataFrame. Inserting data without defined CRS."
         try:
@@ -446,7 +446,7 @@ def test_no_crs(self, example_staypoints, conn_postgis):
         table = "staypoints"
         sql = f"SELECT * FROM {table}"
         geom_col = example_staypoints.geometry.name
-        sp.crs = None
+        sp = sp.set_crs(None, allow_override=True)

         no_crs_warning = "Could not parse CRS from the GeoDataFrame. Inserting data without defined CRS."
         try:
@@ -500,7 +500,7 @@ def test_no_crs(self, example_locations, conn_postgis):
         table = "locations"
         sql = f"SELECT * FROM {table}"
         geom_col = locs.geometry.name
-        locs.crs = None
+        locs = locs.set_crs(None, allow_override=True)

         no_crs_warning = "Could not parse CRS from the GeoDataFrame. Inserting data without defined CRS."
         try:
@@ -731,7 +731,7 @@ class TestGetSrid:
     def test_srid(self, example_positionfixes):
         """Test if `_get_srid` returns the correct srid."""
         gdf = example_positionfixes.copy()
-        gdf.crs = None
+        gdf = gdf.set_crs(None, allow_override=True)
         assert _get_srid(gdf) == -1
         srid = 3857
         gdf.set_crs(f"epsg:{srid}", inplace=True)
9 changes: 4 additions & 5 deletions tests/preprocessing/test_positionfixes.py
@@ -74,10 +74,11 @@ def example_positionfixes_isolated():
         {"user_id": 2, "tracked_at": t2, "geometry": p2, "staypoint_id": pd.NA},
         {"user_id": 2, "tracked_at": t4, "geometry": p3, "staypoint_id": 5},
     ]
-    pfs = gpd.GeoDataFrame(data=list_dict, geometry="geometry", crs="EPSG:4326")
+    pfs = ti.Positionfixes(data=list_dict, geometry="geometry", crs="EPSG:4326")
+    pfs["staypoint_id"] = pfs["staypoint_id"].astype("Int64")
     pfs.index.name = "id"

-    return ti.Positionfixes(pfs)
+    return pfs


class TestGenerate_staypoints:
@@ -201,9 +202,7 @@ def test_include_last(self):
         """Test if the include_last argument will include the last pfs as stp."""
         pfs, _ = ti.io.dataset_reader.read_geolife(os.path.join("tests", "data", "geolife"))

-        pfs_wo, sp_wo = pfs.generate_staypoints(
-            method="sliding", dist_threshold=100, time_threshold=5.0, include_last=False
-        )
+        pfs_wo, sp_wo = pfs.generate_staypoints()
         pfs_include, sp_include = pfs.generate_staypoints(
             method="sliding", dist_threshold=100, time_threshold=5.0, include_last=True
         )
2 changes: 1 addition & 1 deletion tests/preprocessing/test_staypoints.py
@@ -178,7 +178,7 @@ def test_dbscan_hav_euc(self):
             method="dbscan", epsilon=100, num_samples=1, distance_metric="haversine", agg_level="dataset"
         )
         # WGS_1984
-        sp.crs = "epsg:4326"
+        sp = sp.set_crs("epsg:4326", allow_override=True)
         # WGS_1984_UTM_Zone_49N
         sp = sp.to_crs("epsg:32649")
20 changes: 10 additions & 10 deletions tests/preprocessing/test_triplegs.py
@@ -387,27 +387,27 @@ def test_crs(self, example_triplegs):
         """Test that the resulting GeoDataFrame has the correct crs or a warning or error is thrown if not set"""
         sp, tpls = example_triplegs
         # Case 1: sp crs None --> throw warning and set to tpls crs
-        sp.crs = None
+        sp = sp.set_crs(None, allow_override=True)
         with pytest.warns(UserWarning):
             _, _, trips = generate_trips(sp, tpls)
         assert trips.crs == tpls.crs
         # Case 2: Both crs None --> warn and set to None
-        tpls.crs = None
+        tpls = tpls.set_crs(None, allow_override=True)
         with pytest.warns(UserWarning):
             _, _, trips = generate_trips(sp, tpls)
         assert trips.crs is None
         # Case 3: tpls crs is None --> throw warning and set to sp crs
-        sp.crs = "EPSG:4326"
+        sp = sp.set_crs("EPSG:4326", allow_override=True)
         with pytest.warns(UserWarning):
             _, _, trips = generate_trips(sp, tpls)
         assert trips.crs == "EPSG:4326"
         # Case 4: Both crs set and correspond
-        tpls.crs = "EPSG:2056"
-        sp.crs = "EPSG:2056"
+        tpls = tpls.set_crs("EPSG:2056", allow_override=True)
+        sp = sp.set_crs("EPSG:2056", allow_override=True)
         _, _, trips = generate_trips(sp, tpls)
         assert trips.crs == "EPSG:2056"
         # Case 5: Both crs set but differ --> throw error
-        sp.crs = "EPSG:4326"
+        sp = sp.set_crs("EPSG:4326", allow_override=True)
         error_msg = "CRS of staypoints and triplegs differ. Geometry cannot be joined safely."
         with pytest.raises(AssertionError, match=error_msg):
             generate_trips(sp, tpls)
@@ -432,10 +432,9 @@ def _create_debug_sp_tpls_data(sp, tpls, gap_threshold):
     sp_tpls["is_activity"] = sp_tpls["is_activity"].__eq__(True)
     sp_tpls.sort_values(by=["user_id", "started_at"], inplace=True)
     sp_tpls["started_at_next"] = sp_tpls["started_at"].shift(-1)
-    sp_tpls["activity_next"] = sp_tpls["is_activity"].shift(-1)
+    sp_tpls["activity_next"] = sp_tpls["is_activity"].shift(-1, fill_value=False)

     sp_tpls["gap"] = (sp_tpls["started_at_next"] - sp_tpls["finished_at"]).dt.seconds / 60 > gap_threshold
-
     return sp_tpls


@@ -519,16 +518,17 @@ def _generate_trips_old(sp_input, tpls_input, gap_threshold=15, print_progress=F
     sp_tpls["started_at_next"] = sp_tpls["started_at"].shift(-1)
     sp_tpls["is_activity_next"] = sp_tpls["is_activity"].shift(-1)

+    cols = ["started_at", "finished_at", "user_id", "type", "is_activity", "id", "started_at_next", "is_activity_next"]
     if print_progress:
         tqdm.pandas(desc="User trip generation")
         trips = (
-            sp_tpls.groupby(["user_id"], group_keys=False, as_index=False)
+            sp_tpls.groupby(["user_id"], group_keys=False, as_index=False)[cols]
             .progress_apply(_generate_trips_user, gap_threshold=gap_threshold)
             .reset_index(drop=True)
         )
     else:
         trips = (
-            sp_tpls.groupby(["user_id"], group_keys=False, as_index=False)
+            sp_tpls.groupby(["user_id"], group_keys=False, as_index=False)[cols]
             .apply(_generate_trips_user, gap_threshold=gap_threshold)
             .reset_index(drop=True)
         )
4 changes: 2 additions & 2 deletions trackintel/analysis/metrics.py
@@ -38,9 +38,9 @@ def radius_gyration(sp, method="count", print_progress=False):

     if print_progress:
         tqdm.pandas(desc="User radius of gyration calculation")
-        s = sp.groupby("user_id").progress_apply(_radius_gyration_user, method=method)
+        s = sp.groupby("user_id").progress_apply(_radius_gyration_user, method=method, include_groups=False)
     else:
-        s = sp.groupby("user_id").apply(_radius_gyration_user, method=method)
+        s = sp.groupby("user_id").apply(_radius_gyration_user, method=method, include_groups=False)

     s = s.rename("radius_gyration")
     return s
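
Both branches get the same treatment: since pandas 2.2, `DataFrameGroupBy.apply` warns that it will stop passing the grouping columns into the applied function, and `include_groups=False` opts into the future behavior, silencing the warning. A standalone sketch (assuming pandas >= 2.2):

```python
import pandas as pd

sp = pd.DataFrame({"user_id": [0, 0, 1], "x": [1.0, 3.0, 5.0]})

# Without include_groups=False this emits a DeprecationWarning in pandas 2.2+,
# because "user_id" itself would be handed to the lambda as a column.
s = sp.groupby("user_id").apply(lambda g: g["x"].mean(), include_groups=False)
print(s)  # user_id 0 -> 2.0, 1 -> 5.0
```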
26 changes: 15 additions & 11 deletions trackintel/analysis/tracking_quality.py
@@ -76,7 +76,9 @@ def temporal_tracking_quality(source, granularity="all"):
         return None

     if granularity == "all":
-        quality = df.groupby("user_id", as_index=False).apply(_get_tracking_quality_user, granularity)
+        quality = df.groupby("user_id", as_index=False).apply(
+            _get_tracking_quality_user, granularity, include_groups=False
+        )
         return quality

     # split records that span several days
@@ -90,19 +92,11 @@ def temporal_tracking_quality(source, granularity="all"):
         column_name = "week_monday"

     elif granularity == "weekday":
-        # get the tracked week relative to the first day
-        start_date = df["started_at"].min().floor(freq="D")
-        df["week"] = ((df["started_at"] - start_date)).dt.days // 7
-
         grouper = df["started_at"].dt.weekday
         column_name = "weekday"

     elif granularity == "hour":
         df = _split_overlaps(df, granularity="hour")
-        # get the tracked day relative to the first day
-        start_date = df["started_at"].min().floor(freq="D")
-        df["day"] = (df["started_at"] - start_date).dt.days
-
         grouper = df["started_at"].dt.hour
         column_name = "hour"

@@ -111,8 +105,13 @@ def temporal_tracking_quality(source, granularity="all"):
         f"granularity unknown. We only support ['all', 'day', 'week', 'weekday', 'hour']. You passed {granularity}"
     )

+    start_date = df["started_at"].min().floor(freq="D")
     # calculate per-user per-grouper tracking quality
-    quality = df.groupby(["user_id", grouper]).apply(_get_tracking_quality_user, granularity).reset_index()
+    quality = (
+        df.groupby(["user_id", grouper])[["started_at", "finished_at"]]
+        .apply(_get_tracking_quality_user, start_date, granularity)
+        .reset_index()
+    )

     # rename and reorder
     quality.rename(columns={"started_at": column_name}, inplace=True)
@@ -121,7 +120,7 @@ def temporal_tracking_quality(source, granularity="all"):
     return quality


-def _get_tracking_quality_user(df, granularity="all"):
+def _get_tracking_quality_user(df, start_date, granularity="all"):
     """
     Tracking quality per-user per-granularity.
@@ -130,6 +129,9 @@ def _get_tracking_quality_user(df, granularity="all"):
     df : Trackintel class
         The source dataframe
+    start_date: pd.Timestamp
+        When measurement started, used to calculate in which weekday or week the measurement lies.
     granularity : {"all", "day", "weekday", "week", "hour"}, default "all"
         Determines the extent of the tracking. "all" the entire tracking period,
         "day" and "weekday" a whole day, "week" a whole week, and "hour" a whole hour.
@@ -149,13 +151,15 @@ def _get_tracking_quality_user(df, granularity="all"):
     elif granularity == "weekday":
         # total seconds in a day * number of tracked weeks
         # (entries from multiple weeks may be grouped together)
+        df["week"] = ((df["started_at"] - start_date)).dt.days // 7
         extent = 60 * 60 * 24 * (df["week"].max() - df["week"].min() + 1)
     elif granularity == "week":
         # total seconds in a week
         extent = 60 * 60 * 24 * 7
     elif granularity == "hour":
         # total seconds in an hour * number of tracked days
         # (entries from multiple days may be grouped together)
+        df["day"] = (df["started_at"] - start_date).dt.days
         extent = (60 * 60) * (df["day"].max() - df["day"].min() + 1)
     else:
         raise ValueError(
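
Worth noting why start_date moved: the groupby now selects only ["started_at", "finished_at"] before the apply, so helper columns like "week" or "day" computed on the full frame would never reach _get_tracking_quality_user. Instead, a single start_date is computed once and passed down, and the helper derives the offsets itself. A toy sketch of that column-selection pattern (hypothetical _quality helper, not the trackintel one):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "user_id": [0, 0, 1],
        "started_at": pd.to_datetime(["2023-01-02", "2023-01-09", "2023-01-03"]),
        "finished_at": pd.to_datetime(["2023-01-02", "2023-01-09", "2023-01-04"]),
    }
)

def _quality(group, start_date):
    # group only contains the selected columns, so derived values
    # (e.g. the week offset) must be computed here, not beforehand.
    week = (group["started_at"] - start_date).dt.days // 7
    return week.nunique()  # hypothetical stand-in for a quality measure

start_date = df["started_at"].min().floor("D")
out = df.groupby("user_id")[["started_at", "finished_at"]].apply(_quality, start_date)
print(out)  # user 0 spans two weeks -> 2, user 1 -> 1
```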
2 changes: 1 addition & 1 deletion trackintel/geogr/distances.py
@@ -358,7 +358,7 @@ def get_speed_triplegs(triplegs, positionfixes=None, method="tpls_speed"):
     if "tripleg_id" not in positionfixes:
         raise AttributeError('Positionfixes must include column "tripleg_id".')
     # group positionfixes by triplegs and compute average speed for each collection of positionfixes
-    grouped_pfs = positionfixes.groupby("tripleg_id").apply(_single_tripleg_mean_speed)
+    grouped_pfs = positionfixes.groupby("tripleg_id").apply(_single_tripleg_mean_speed, include_groups=False)
     # add the speed values to the triplegs column
     tpls = pd.merge(triplegs, grouped_pfs.rename("speed"), how="left", left_index=True, right_index=True)
     tpls.index = tpls.index.astype("int64")
6 changes: 3 additions & 3 deletions trackintel/geogr/filter.py
@@ -55,11 +55,11 @@ def spatial_filter(source, areas, method="within", re_project=False):

     # get final result
     if method == "within":
-        ret_gdf = possible_matches.loc[possible_matches.within(areas.unary_union)]
+        ret_gdf = possible_matches.loc[possible_matches.within(areas.union_all())]
     elif method == "intersects":
-        ret_gdf = possible_matches.loc[possible_matches.intersects(areas.unary_union)]
+        ret_gdf = possible_matches.loc[possible_matches.intersects(areas.union_all())]
     elif method == "crosses":
-        ret_gdf = possible_matches.loc[possible_matches.crosses(areas.unary_union)]
+        ret_gdf = possible_matches.loc[possible_matches.crosses(areas.union_all())]
     else:
         raise ValueError("method unknown. We only support ['within', 'intersects', 'crosses']. " f"You passed {method}")

8 changes: 4 additions & 4 deletions trackintel/preprocessing/positionfixes.py
@@ -254,7 +254,7 @@ def generate_triplegs(

     # initialize the index list of pfs where a tpl will begin
     insert_index_ls = []
-    pfs["staypoint_id"] = pd.NA
+    pfs["staypoint_id"] = pd.Series(dtype="Int64")

     for user_id_this in pfs["user_id"].unique():
         sp_user = staypoints[staypoints["user_id"] == user_id_this]
@@ -282,7 +282,7 @@ def generate_triplegs(

     # initialize tripleg_id with pd.NA and fill all pfs that belong to staypoints with -1
     # pd.NA will be replaced later with tripleg ids
-    pfs["tripleg_id"] = pd.NA
+    pfs["tripleg_id"] = pd.Series(dtype="Int64")
     pfs.loc[~pd.isna(pfs["staypoint_id"]), "tripleg_id"] = -1

     # get all conditions that trigger a new tripleg.
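
Initializing the id columns with the scalar pd.NA would make them object dtype and provoke downstream dtype warnings; assigning an empty nullable-integer Series instead aligns on the index and yields an all-<NA> column that is Int64 from the start. A minimal sketch:

```python
import pandas as pd

pfs = pd.DataFrame({"user_id": [0, 1]})

pfs["old_way"] = pd.NA                      # object dtype
pfs["new_way"] = pd.Series(dtype="Int64")   # Int64 dtype, values all <NA>

print(pfs["old_way"].dtype, pfs["new_way"].dtype)  # object Int64
```
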
@@ -437,7 +437,7 @@ def _generate_triplegs_overlap_staypoints(cond_temporal_gap, pfs, staypoints):

     # spatial overlap: overlap tripleg with the location of previous and next staypoint
     # geometry: tpl's share common start and end pfs with sp
-    cond_overlap_end = cond_overlap & ~cond_temporal_gap.shift(-1).fillna(False) & pd.isna(pfs["tripleg_id"])
+    cond_overlap_end = cond_overlap & ~cond_temporal_gap.shift(-1, fill_value=False) & pd.isna(pfs["tripleg_id"])
     pfs.loc[cond_overlap_end, "tripleg_id"] = between_tpls_ids.shift(-1)[cond_overlap_end]
     cond_empty = pd.isna(pfs["tripleg_id"])
     pfs.loc[cond_empty, "tripleg_id"] = between_tpls_ids[cond_empty]
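
The shift fix targets pandas' downcasting FutureWarning: shifting a boolean Series leaves a NaN in the vacated slot (upcasting to object), and the later fillna(False) then has to downcast back. Supplying fill_value keeps the dtype boolean throughout. Sketch:

```python
import pandas as pd

flags = pd.Series([True, False, True])

# Old: NaN appears first, fillna downcasts object -> bool (FutureWarning).
old = flags.shift(-1).fillna(False)

# New: the hole is filled immediately, dtype never leaves bool.
new = flags.shift(-1, fill_value=False)
print(new.tolist(), new.dtype)  # [False, True, False] bool
```
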
@@ -524,7 +524,7 @@ def __create_new_staypoints(start, end, pfs, elevation_flag, geo_col, last_flag=

     # if end is the last pfs, we want to include the info from it as well
     if last_flag:
         end = len(pfs)
-    points = pfs[geo_col].iloc[start:end].unary_union
+    points = pfs[geo_col].iloc[start:end].union_all()
     if check_gdf_planar(pfs):
         new_sp[geo_col] = points.centroid
     else:
15 changes: 9 additions & 6 deletions trackintel/preprocessing/triplegs.py
@@ -133,7 +133,13 @@ def _seperate_ids(row):
     user_change[["type", "is_activity"]] = ["user_change", True]  # nicer for debugging

     # merge trips with (filler) activities
-    trips.drop(columns=["type", "sp_tpls_id"], inplace=True)  # make space so no overlap with activity "sp_tpls_id"
+
+    # make space so no overlap with activity "sp_tpls_id"
+    trips.drop(columns=["type", "sp_tpls_id"], inplace=True)
+
+    # trips are no activity (with this we don't have to fillna later)
+    trips["is_activity"] = False

     # Inserting `gaps` and `user_change` into the dataframe creates buffers that catch shifted
     # "staypoint_id" and "trip_id" from corrupting staypoints/trips.
     trips_with_act = pd.concat((trips, sp_tpls_only_act, gaps, user_change), axis=0, ignore_index=True)
@@ -153,8 +159,6 @@ def _seperate_ids(row):
     trips_with_act["prev_trip_id"] = trips_with_act["trip_id"].shift(1)
     trips_with_act["next_trip_id"] = trips_with_act["trip_id"].shift(-1)

-    # transform column to binary
-    trips_with_act["is_activity"] = trips_with_act["is_activity"].fillna(False)
     # delete activities
     trips = trips_with_act[~trips_with_act["is_activity"]].copy()

@@ -268,9 +272,8 @@ def _concat_staypoints_triplegs(staypoints, triplegs, add_geometry):
     sp["type"] = "staypoint"

     # create table with relevant information from triplegs and staypoints.
-    sp_cols = ["started_at", "finished_at", "user_id", "type", "is_activity"]
-    tpls_cols = ["started_at", "finished_at", "user_id", "type"]
-    sp_tpls = pd.concat([sp[sp_cols], tpls[tpls_cols]])
+    cols = ["started_at", "finished_at", "user_id", "type", "is_activity"]
+    sp_tpls = pd.concat([sp[cols], tpls[cols]])
     sp_tpls["is_activity"] = sp_tpls["is_activity"].fillna(False)
     sp_tpls["sp_tpls_id"] = sp_tpls.index  # store id for later reassignment
     if add_geometry:
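
Presetting trips["is_activity"] = False (see the first hunk in this file) is what makes the removed fillna(False) unnecessary: pd.concat fills columns missing from one frame with NaN, and filling those NaNs afterwards is exactly the object-downcasting pattern pandas now warns about. Sketch:

```python
import pandas as pd

trips = pd.DataFrame({"user_id": [0]})
activities = pd.DataFrame({"user_id": [1], "is_activity": [True]})

# Preset the flag so concat never creates NaN in "is_activity".
trips["is_activity"] = False
merged = pd.concat([trips, activities], ignore_index=True)
print(merged["is_activity"].tolist())  # [False, True]
```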
