# Source code for trackintel.preprocessing.trips

```
import warnings
from datetime import timedelta
import numpy as np
import pandas as pd
import trackintel as ti
from trackintel import Tours
from trackintel.preprocessing.util import applyParallel
[docs]
def get_trips_grouped(trips, tours):
    """Group the trips of every tour by tour id.

    Parameters
    ----------
    trips: Trips
        Trips dataframe, indexed by trip ``id``. An existing ``tour_id`` column is ignored.

    tours: Tours
        Output of generate_tours function, must contain column "trips" with list of trip ids on tour

    Returns
    -------
    trips_grouped_by_tour: DataFrameGroupBy object
        One group per tour id. Each group holds one row per trip on that tour (column ``trip_id``)
        together with the attributes of that trip.

    Examples
    --------
    >>> get_trips_grouped(trips, tours)

    Notes
    -----
    The trip lists stored on ``tours`` are used instead of the ``tour_id`` column of ``trips``, so a trip
    that is listed on several tours (e.g. a nested tour inside a larger one) appears in the group of each
    of them.
    """
    # work on a copy without any previously assigned tour ids; the input frame stays untouched
    trips_without_tour_id = trips.drop(columns=["tour_id"], errors="ignore")

    # one row per (tour, trip) pair: keep only the two id columns, rename them, unnest the trip lists
    tour_trip_pairs = (
        tours.reset_index()[["id", "trips"]]
        .rename(columns={"id": "tour_id", "trips": "trip_id"})
        .explode("trip_id")
        .reset_index(drop=True)
    )

    # attach the trip attributes to every pair (trips are looked up through their "id" index level)
    pairs_with_trip_data = tour_trip_pairs.merge(
        trips_without_tour_id, left_on="trip_id", right_on="id", how="left"
    )
    return pairs_with_trip_data.groupby("tour_id")
[docs]
def generate_tours(
    trips,
    staypoints=None,
    max_dist=100,
    max_time="1d",
    max_nr_gaps=0,
    print_progress=False,
    n_jobs=1,
):
    """
    Generate trackintel-tours from trips.

    Parameters
    ----------
    trips : Trips
        Trips to be combined into tours.

    staypoints : Staypoints, default None
        Must have `location_id` column to connect trips via locations to a tour.
        If None, trips are connected purely by the distance threshold `max_dist`.

    max_dist: float, default 100 (meters)
        Maximum distance between the end point of one trip and the start point of the next trip within a tour.
        This parameter is only used if staypoints is `None`!
        Also, if `max_nr_gaps > 0`, a tour can contain larger spatial gaps (see Notes below for more detail)

    max_time: str or pd.Timedelta, default "1d" (1 day)
        Maximum time that a tour is allowed to take. Strings are converted with `pd.to_timedelta`.

    max_nr_gaps: int, default 0
        Maximum number of spatial gaps on the tour. Use with caution - see notes below.

    print_progress : bool, default False
        If print_progress is True, the progress bar is displayed

    n_jobs: int, default 1
        The maximum number of concurrently running jobs. If -1 all CPUs are used. If 1 is given, no parallel
        computing code is used at all, which is useful for debugging. See
        https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation
        for a detailed description

    Returns
    -------
    trips_with_tours: Trips
        Same as `trips`, but with column `tour_id`, containing a list of the tours that the trip is part of
        (see notes). If no tour is found at all, the trips are returned without a `tour_id` column.

    tours: Tours
        The generated tours

    Raises
    ------
    TypeError
        If `max_time` is neither a string nor a `pd.Timedelta`.

    Examples
    --------
    >>> trips.generate_tours(staypoints)

    Notes
    -----
    - Tours are defined as a collection of trips in a certain time frame that start and end at the same point
    - Tours and trips have an N:N relationship: One tour consists of multiple trips, but also one trip can be part of
      multiple tours, due to nested tours or overlapping tours.
    - This function implements two possibilities to generate tours of trips: Via the location ID in the `staypoints`
      df, or via a maximum distance. Thus, note that only one of the parameters `staypoints` or `max_dist` is used!
    - Nested tours are possible and will be regarded as 2 (or more tours).
    - It is possible to allow spatial gaps to occur on the tour, which might be useful to deal with missing data.
      Example: The two trips home-work, supermarket-home would still be detected as a tour when max_nr_gaps >= 1,
      although the work-supermarket trip is missing.
      Warning: This only counts the number of gaps, but neither temporal or spatial distance of gaps, nor the number
      of missing trips in a gap are bounded. Thus, this parameter should be set with caution, because trips that are
      hours apart might still be connected to a tour if `max_nr_gaps > 0`.
    """
    # Two modes: trips are linked through the location ids of their staypoints, or through a distance
    # threshold between the end of one trip and the start of the next
    if staypoints is not None:
        assert (
            "location_id" in staypoints.columns
        ), "Staypoints with location ID is required, otherwise tours are generated without location using max_dist"
        ti.Staypoints.validate(staypoints)
        ti.TripsDataFrame.validate(trips)
        # geometry information is not used in location mode
        geom_col, crs_is_projected = None, False
    else:
        # distance mode needs a geometry column on the trips
        ti.TripsGeoDataFrame.validate(trips)
        geom_col = trips.geometry.name
        crs_is_projected = ti.geogr.check_gdf_planar(trips)

    # normalise max_time to a Timedelta, rejecting every other type
    if not isinstance(max_time, (str, pd.Timedelta)):
        raise TypeError("Parameter max_time must be either of type String or pd.Timedelta!")
    if isinstance(max_time, str):
        max_time = pd.to_timedelta(max_time)

    trips_input = trips.copy()
    # a tour assignment from a previous run is discarded
    if "tour_id" in trips_input:
        trips_input = trips_input.drop(columns="tour_id")
        warnings.warn("Deleted existing column 'tour_id' from trips.")

    # tours are searched for each user separately
    tours = applyParallel(
        trips_input.groupby("user_id", group_keys=False, as_index=False),
        _generate_tours_user,
        print_progress=print_progress,
        n_jobs=n_jobs,
        max_dist=max_dist,
        max_nr_gaps=max_nr_gaps,
        max_time=max_time,
        staypoints=staypoints,
        geom_col=geom_col,
        crs_is_projected=crs_is_projected,
    )

    if len(tours) == 0:
        warnings.warn("No tours can be generated, return empty tours")
        return trips_input, tours

    # consecutive tour ids become the index
    tours["id"] = np.arange(len(tours))
    tours = tours.set_index("id")

    # for every trip, collect the ids of all tours it is part of (as a list)
    tours_per_trip = (
        tours.reset_index()
        .explode("trips")
        .rename(columns={"id": "tour_id"})
        .groupby("trips")
        .agg({"tour_id": list})
    )
    # left join: trips that are on no tour get a missing value in "tour_id"
    trips_with_tours = trips_input.join(tours_per_trip, how="left")

    # tour ids generated here should be int64
    tours.index = tours.index.astype("int64")
    return trips_with_tours, Tours(tours)
def _generate_tours_user(
    user_trip_df,
    staypoints=None,
    max_dist=100,
    max_nr_gaps=0,
    max_time=timedelta(days=1),
    geom_col="geom",
    crs_is_projected=False,
):
    """
    Compute tours from trips for one user.

    The trips are processed in order of their start time. A list of "start candidates" (trip ids, with
    ``np.nan`` entries marking spatial gaps) is maintained; for every trip it is checked whether the trip
    ends where one of the candidates started, in which case the trips in between form a tour.

    Parameters
    ----------
    user_trip_df : Trips
        The trips have to follow the standard definition for trips DataFrames and must all belong to
        the same user.

    staypoints : Staypoints, optional
        Must contain location ID column to connect trips via locations to a tour.
        If None, trips will be connected based only on a distance threshold `max_dist`.

    max_dist: float, default 100 (meters)
        Maximum distance between the end point of one trip and the start point of the next trip on a tour.
        However, if `max_nr_gaps > 0`, a tour can contain larger spatial gaps (see notes in `generate_tours`)

    max_nr_gaps: int, default 0
        Maximum number of spatial gaps on the tour. Use with caution - see notes in `generate_tours`.

    max_time: Timedelta, default 1 day
        Maximum time that a tour is allowed to take

    geom_col : str, optional
        Name of geometry column of user_trip_df, by default "geom". Only read if `staypoints` is None;
        the first/second sub-geometry of each entry is used as start/end point of the trip.

    crs_is_projected : bool, optional
        Whether the crs of user_trip_df is projected, by default False

    Returns
    -------
    tours_df: DataFrame
        Tours for one user. If no tour is found, an empty DataFrame with the tour columns is returned.

    Raises
    ------
    AssertionError
        If `user_trip_df` does not contain exactly one distinct user id.
    """
    # all trips must belong to exactly one user
    assert len(user_trip_df["user_id"].unique()) == 1

    # sort by time
    user_trip_df = user_trip_df.sort_values(by=["started_at"])

    # trip ids (row.name) that may still become the first trip of a tour; np.nan marks a spatial gap
    start_candidates = []
    # collected tours (one dict per tour)
    tours = []

    for _, row in user_trip_df.iterrows():
        end_time = row["finished_at"]

        if start_candidates:
            # Is there a spatial gap between the previous and the current trip?
            previous_trip_id = start_candidates[-1]
            if staypoints is not None:
                # with locations: both staypoints have to share the same location
                end_start_at_same_loc = _check_same_loc(
                    user_trip_df.loc[previous_trip_id, "destination_staypoint_id"],  # dest. stp of previous trip
                    row["origin_staypoint_id"],  # start stp of current trip
                    staypoints,
                )
            else:
                # without locations: the points have to be at most max_dist apart
                end_start_at_same_loc = _check_max_dist(
                    user_trip_df.loc[previous_trip_id, geom_col].geoms[1],  # destination point of previous trip
                    row[geom_col].geoms[0],  # start point of current trip
                    max_dist,
                    crs_is_projected,
                )

            # if the current trip does not start at the end of the previous trip, there is a gap
            if not end_start_at_same_loc:
                if max_nr_gaps == 0:
                    # no gaps allowed - restart the search from the current trip
                    # NOTE: the current trip itself is not tested as a tour in this case
                    start_candidates = [row.name]
                    continue
                # gaps allowed - mark the gap and search further
                start_candidates.append(np.nan)

        # the current trip is a start candidate as well
        start_candidates.append(row.name)

        # a trip ending in an unknown activity cannot close a tour
        if pd.isna(row["destination_staypoint_id"]):
            continue

        # index from which on the candidates are kept after this iteration
        new_list_start = 0
        # number of gaps between the inspected candidate and the current trip
        gap_counter = 0

        # go backwards in time through the candidates and test whether one forms a tour with the current trip
        for j, cand in enumerate(reversed(start_candidates)):
            if np.isnan(cand):
                gap_counter += 1
                if gap_counter > max_nr_gaps:
                    # These gaps won't vanish, so everything up to and including this gap can be dropped.
                    # j counts from the end of the list: the gap sits at forward index
                    # len(start_candidates) - 1 - j, hence the list is kept from the element after it.
                    new_list_start = len(start_candidates) - j
                    break
                continue

            # candidates older than max_time (and all earlier ones) cannot start a tour any more
            cand_start_time = user_trip_df.loc[cand, "started_at"]
            if end_time - cand_start_time > max_time:
                new_list_start = len(start_candidates) - j - 1
                break

            # a trip starting at an unknown activity cannot open a tour
            if pd.isna(user_trip_df.loc[cand, "origin_staypoint_id"]):
                continue

            # does the current trip end where the candidate started?
            if staypoints is not None:
                end_start_at_same_loc = _check_same_loc(
                    user_trip_df.loc[cand, "origin_staypoint_id"],  # start stp of first trip
                    row["destination_staypoint_id"],  # destination stp of current trip
                    staypoints,
                )
            else:
                end_start_at_same_loc = _check_max_dist(
                    user_trip_df.loc[cand, geom_col].geoms[0],  # start point of first trip
                    row[geom_col].geoms[1],  # destination point of current trip
                    max_dist,
                    crs_is_projected=crs_is_projected,
                )

            if end_start_at_same_loc:
                # Tour found: it consists of the last j + 1 candidate entries without the gap markers
                non_gap_trip_idxs = [c for c in start_candidates[-j - 1 :] if not np.isnan(c)]
                tour_candidate = user_trip_df[user_trip_df.index.isin(non_gap_trip_idxs)]
                tours.append(_create_tour_from_stack(tour_candidate, staypoints, max_time))
                # do not consider the other trips - one trip cannot close two tours at a time
                break

        # drop the candidates that can no longer start a tour
        start_candidates = start_candidates[new_list_start:]

    if not tours:
        # keep the column layout stable even if no tour was found
        return pd.DataFrame(
            tours,
            columns=[
                "user_id",
                "started_at",
                "finished_at",
                "origin_staypoint_id",
                "destination_staypoint_id",
                "trips",
                "location_id",
            ],
        )
    return pd.DataFrame(tours)
def _check_same_loc(stp1, stp2, staypoints):
    """Check whether two staypoints are at the same location.

    Parameters
    ----------
    stp1 : int
        First staypoint id (may be missing, e.g. NaN)

    stp2 : int
        Second staypoint id (may be missing, e.g. NaN)

    staypoints : Trackintel staypoints
        GeoDataFrame with staypoints and also location ids. The staypoint ids are looked up in its index
        and the column `location_id` is read.

    Returns
    -------
    share_location: bool
        True if both staypoints are known, both have a location id assigned and these ids are equal.
        False otherwise.
    """
    # an unknown staypoint cannot be matched to anything
    if pd.isna(stp1) or pd.isna(stp2):
        return False

    loc1 = staypoints.loc[stp1, "location_id"]
    loc2 = staypoints.loc[stp2, "location_id"]

    # Staypoints without an assigned location never share a location. Checked explicitly, because
    # comparing missing values directly is unreliable: `None == None` is True and `pd.NA == pd.NA`
    # is `pd.NA`, which cannot be used in a boolean context.
    if pd.isna(loc1) or pd.isna(loc2):
        return False

    return bool(loc1 == loc2)
def _check_max_dist(p1, p2, max_dist, crs_is_projected=False):
    """
    Check whether two points p1, p2 are at most max_dist apart.

    Parameters
    ----------
    p1, p2: shapely Point objects
        The two points to compare.

    max_dist: int
        Distance threshold (inclusive).

    crs_is_projected: bool, default False
        If True, the distance is taken from ``p1.distance(p2)``. Otherwise it is computed from the x/y
        coordinates of the points with ``ti.geogr.point_haversine_dist``.

    Returns
    -------
    dist_below_thresh: bool
        indicating whether p1 and p2 are less than or equal to max_dist apart
    """
    separation = (
        p1.distance(p2)
        if crs_is_projected
        else ti.geogr.point_haversine_dist(p1.x, p1.y, p2.x, p2.y)
    )
    return separation <= max_dist
def _create_tour_from_stack(temp_tour_stack, staypoints, max_time):
    """
    Aggregate the trips of one tour into a structured dictionary.

    Parameters
    ----------
    temp_tour_stack : DataFrame
        All trips that will be aggregated into a tour, indexed by trip id. Must contain at least one row;
        the first row is taken as first trip and the last row as last trip of the tour.

    staypoints : Staypoints or None
        If given, the `location_id` of the tour is looked up via the origin staypoint of the first trip.
        If None, the `location_id` of the tour is set to `pd.NA`.

    max_time : Timedelta
        Maximum duration of a tour; only used for a consistency check.

    Returns
    -------
    tour_dict_entry: dictionary
        Keys: "user_id", "started_at", "finished_at", "origin_staypoint_id", "destination_staypoint_id",
        "trips" (list of trip ids) and "location_id".

    Raises
    ------
    AssertionError
        If start and end location differ, the trips belong to more than one user, or the tour takes
        longer than `max_time`.
    """
    first_trip, last_trip = temp_tour_stack.iloc[0], temp_tour_stack.iloc[-1]

    if staypoints is None:
        # no locations available
        start_loc = pd.NA
    else:
        start_loc = staypoints.loc[first_trip["origin_staypoint_id"], "location_id"]
        # double check whether start and end location are the same
        end_loc = staypoints.loc[last_trip["destination_staypoint_id"], "location_id"]
        assert start_loc == end_loc

    # all data has to be from the same user
    assert len(temp_tour_stack["user_id"].unique()) == 1
    # double check if tour requirements are fulfilled
    assert last_trip["finished_at"] - first_trip["started_at"] <= max_time

    return {
        "user_id": first_trip["user_id"],
        "started_at": first_trip["started_at"],
        "finished_at": last_trip["finished_at"],
        "origin_staypoint_id": first_trip["origin_staypoint_id"],
        "destination_staypoint_id": last_trip["destination_staypoint_id"],
        "trips": list(temp_tour_stack.index),
        "location_id": start_loc,
    }
```