diff --git a/doc/_static/home/large_data.gif b/doc/_static/home/large_data.gif index d232352ed..6eb1bf3a4 100644 Binary files a/doc/_static/home/large_data.gif and b/doc/_static/home/large_data.gif differ diff --git a/doc/index.md b/doc/index.md index a9763c788..553dc97a1 100644 --- a/doc/index.md +++ b/doc/index.md @@ -263,10 +263,11 @@ align: center :::{tab-item} Layout ```python import hvplot.pandas -from hvplot.sample_data import us_crime as df +from hvplot.sampledata import stocks -plot1 = df.hvplot(x='Year', y='Violent Crime rate', width=400) -plot2 = df.hvplot(x='Year', y='Burglary rate', width=400) +df = stocks('pandas') +plot1 = df.hvplot(x='date', y='Apple', width=400) +plot2 = df.hvplot(x='date', y='Google', width=400) plot1 + plot2 ``` ```{image} ./_static/home/layout.gif @@ -312,10 +313,10 @@ align: center :::{tab-item} Large Data ```python import hvplot.pandas -from hvplot.sample_data import catalogue as cat +from hvplot.sampledata import synthetic_clusters -df = cat.airline_flights.read() -df.hvplot.scatter(x='distance', y='airtime', rasterize=True, cnorm='eq_hist', width=500) +df = synthetic_clusters('pandas') +df.hvplot.points(datashade=True, by='cat', width=500) ``` ```{image} ./_static/home/large_data.gif --- diff --git a/doc/user_guide/Plotting.ipynb b/doc/user_guide/Plotting.ipynb index aa452700a..d9495950d 100644 --- a/doc/user_guide/Plotting.ipynb +++ b/doc/user_guide/Plotting.ipynb @@ -39,7 +39,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", "import hvplot.pandas # noqa\n", "import hvplot.dask # noqa" ] @@ -54,10 +53,10 @@ "\n", "We will be focusing on two different datasets:\n", "\n", - "- A small CSV file of US crime data, broken down by state\n", - "- A larger Parquet-format file of airline flight data\n", + "- A small CSV file of Apple Inc. 
(AAPL) daily stock prices\n", + "- A larger synthetic dataset of points organised into 5 clusters\n", "\n", - "The ``hvplot.sample_data`` module makes these datasets Intake data catalogue, which we can load either using pandas:" + "The ``hvplot.sampledata`` module provides access to both datasets via the ``hvsampledata`` package, which we can load either using pandas:" ] }, { @@ -66,11 +65,11 @@ "metadata": {}, "outputs": [], "source": [ - "from hvplot.sample_data import us_crime, airline_flights\n", + "from hvplot.sampledata import apple_stocks, synthetic_clusters, penguins\n", "\n", - "crime = us_crime.read()\n", - "print(type(crime))\n", - "crime.head()" + "stocks = apple_stocks('pandas')\n", + "print(type(stocks))\n", + "stocks.head()" ] }, { @@ -86,9 +85,9 @@ "metadata": {}, "outputs": [], "source": [ - "flights = airline_flights.to_dask().persist()\n", - "print(type(flights))\n", - "flights.head()" + "clusters = synthetic_clusters('dask', lazy=True)\n", + "print(type(clusters))\n", + "clusters.head()" ] }, { @@ -102,7 +101,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The ``dask.dataframe.DataFrame.hvplot``, ``pandas.DataFrame.hvplot`` and ``intake.DataSource.plot`` interfaces (and Series equivalents) from HvPlot provide a powerful high-level API to generate complex plots. The ``.hvplot`` API can be called directly or used as a namespace to generate specific plot types." + "The ``dask.dataframe.DataFrame.hvplot`` and ``pandas.DataFrame.hvplot`` interfaces (and Series equivalents) from HvPlot provide a powerful high-level API to generate complex plots. The ``.hvplot`` API can be called directly or used as a namespace to generate specific plot types." 
] }, { @@ -125,7 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.line(x='Year', y='Violent Crime rate')" + "stocks.hvplot.line(x='date', y='close')" ] }, { @@ -141,14 +140,14 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y='Violent Crime rate', kind='scatter')" + "stocks.hvplot(x='date', y='close', kind='scatter')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To group the data by one or more additional columns, specify an additional ``by`` variable. As an example here we will plot the departure delay ('depdelay') as a function of 'distance', grouping the data by the 'carrier'. There are many available carriers, so we will select only two of them so that the plot is readable:" + "To group the data by one or more additional columns, specify an additional ``by`` variable. As an example here we will plot the y coordinate as a function of x, grouping the data by cluster category ('cat'). There are 5 clusters available, so we will select only two of them so that the plot is readable:" ] }, { @@ -157,8 +156,8 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['OH', 'F9'])]\n", - "flight_subset.hvplot(x='distance', y='depdelay', by='carrier', kind='scatter', alpha=0.2, persist=True)" + "cluster_subset = clusters[clusters.cat.isin(['d1', 'd2'])].compute()\n", + "cluster_subset.hvplot(x='x', y='y', by='cat', kind='scatter', alpha=0.2)" ] }, { @@ -167,7 +166,7 @@ "source": [ "Here we have specified the `x` axis explicitly, which can be omitted if the Pandas index column is already the desired x axis. Similarly, here we specified the `y` axis; by default all of the non-index columns would be plotted (which would be a lot of data in this case). 
If you don't specify the 'y' axis, it will have a default label named 'value', but you can then provide a y axis label explicitly using the ``value_label`` option.\n", "\n", - "Putting all of this together we will plot violent crime, robbery, and burglary rates on the y-axis, specifying 'Year' as the x, and relabel the y-axis to display the 'Rate'." + "Putting all of this together we will plot the closing price, daily high and daily low on the y-axis, specifying 'date' as the x, and relabel the y-axis to display the 'Price (USD)'." ] }, { @@ -176,8 +175,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y=['Violent Crime rate', 'Robbery rate', 'Burglary rate'],\n", - " value_label='Rate (per 100k people)')" + "stocks.hvplot(x='date', y=['close', 'high', 'low'], value_label='Price (USD)')" ] }, { @@ -194,7 +192,7 @@ "Instead of using the ``kind`` argument to the plot call, we can use the ``hvplot`` namespace, which lets us easily discover the range of plot types that are supported. Use tab completion to explore the available plot types:\n", "\n", "```python\n", - "crime.hvplot.\n", + "stocks.hvplot.\n", "```\n", "\n", "Plot types available include:\n", @@ -229,7 +227,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.area(x='Year', y=['Robbery', 'Aggravated assault'])" + "stocks.hvplot.area(x='date', y=['high', 'low'])" ] }, { @@ -245,14 +243,14 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.area(x='Year', y=['Aggravated assault', 'Robbery'], stacked=False, alpha=0.4)" + "stocks.hvplot.area(x='date', y=['high', 'low'], stacked=False, alpha=0.4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Another use for an area plot is to visualize the spread of a value. For instance using the flights dataset we may want to see the spread in mean delay values across carriers. For that purpose we compute the mean delay by day and carrier and then the min/max mean delay for across all carriers. 
Since the output of ``hvplot`` is just a regular holoviews object, we can use the overlay operator (\\*) to place the plots on top of each other." + "Another use for an area plot is to visualize the spread of a value over time. For instance using the stocks dataset we may want to see the range of monthly mean closing prices within each year. For that purpose we group by year and month to get the monthly mean close, and then compute the min/max of those monthly means per year. Since the output of ``hvplot`` is just a regular holoviews object, we can use the overlay operator (\\*) to place the plots on top of each other." ] }, { @@ -261,10 +259,13 @@ "metadata": {}, "outputs": [], "source": [ - "delay_min_max = flights.groupby(['day', 'carrier'])['carrier_delay'].mean().groupby('day').agg(['min', 'max'])\n", - "delay_mean = flights.groupby('day')['carrier_delay'].mean()\n", + "stocks['year'] = stocks['date'].dt.year\n", + "stocks['month'] = stocks['date'].dt.month\n", + "monthly = stocks.groupby(['year', 'month'])['close'].mean().reset_index()\n", + "y_min_max = monthly.groupby('year')['close'].agg(['min', 'max'])\n", + "y_mean = monthly.groupby('year')['close'].mean()\n", "\n", - "delay_min_max.hvplot.area(x='day', y='min', y2='max', alpha=0.2) * delay_mean.hvplot()" + "y_min_max.hvplot.area(x='year', y='min', y2='max', alpha=0.2) * y_mean.hvplot()" ] }, { @@ -273,7 +274,7 @@ "source": [ "#### Bars\n", "\n", - "In the simplest case we can use ``.hvplot.bar`` to plot ``x`` against ``y``. 
We'll use ``rot=90`` to rotate the tick labels on the x-axis making the dates easier to read:" ] }, { @@ -282,7 +283,8 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bar(x='Year', y='Violent Crime rate', rot=90)" + "stocks_subset = stocks[:50]\n", + "stocks_subset.hvplot.bar(x='date', y='close', rot=90)" ] }, { @@ -298,7 +300,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bar(x='Year', y=['Violent crime total', 'Property crime total'],\n", + "stocks_subset.hvplot.bar(x='date', y=['open', 'close'],\n", " stacked=True, rot=90, width=800, legend='top_left')" ] }, @@ -317,7 +319,8 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.scatter(x='Violent Crime rate', y='Burglary rate', c='Year')" + "penguins = penguins('pandas')\n", + "penguins.hvplot.scatter(x='bill_length_mm', y='bill_depth_mm', c='body_mass_g')" ] }, { @@ -342,7 +345,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.step(x='Year', y=['Robbery', 'Aggravated assault'])" + "stocks_subset.hvplot.step(x='date', y=['high', 'low'])" ] }, { @@ -351,7 +354,7 @@ "source": [ "#### HexBins\n", "\n", - "You can create hexagonal bin plots with the ``hexbin`` method. Hexbin plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually. Since these data are not regularly distributed, we'll use the ``logz`` option to map z-axis (color) to a log scale colorbar." + "You can create hexagonal bin plots with the ``hexbin`` method. Hexbin plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually." 
] }, { @@ -360,7 +363,7 @@ "metadata": {}, "outputs": [], "source": [ - "flights.hvplot.hexbin(x='airtime', y='arrdelay', width=600, height=500, logz=True)" + "penguins.hvplot.hexbin(x='bill_length_mm', y='bill_depth_mm')" ] }, { @@ -378,7 +381,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bivariate(x='Violent Crime rate', y='Burglary rate', width=600, height=500)" + "penguins.hvplot.bivariate(x='bill_length_mm', y='bill_depth_mm')" ] }, { @@ -387,7 +390,7 @@ "source": [ "#### HeatMap\n", "\n", - "A ``HeatMap`` lets us view the relationship between three variables, so we specify the 'x' and 'y' variables and an additional 'C' variable. Additionally we can define a ``reduce_function`` that computes the values for each bin from the samples that fall into it. Here we plot the 'depdelay' (i.e. departure delay) for each day of the month and carrier in the dataset:" + "A ``HeatMap`` lets us view the relationship between three variables, so we specify the 'x' and 'y' variables and an additional 'C' variable. Additionally we can define a ``reduce_function`` that computes the values for each bin from the samples that fall into it. 
Here we compute pairwise correlations between the penguins measurement columns, reshape the correlation matrix to long form, and plot the correlation coefficient for each variable pair:" ] }, { @@ -396,7 +399,19 @@ "metadata": {}, "outputs": [], "source": [ - "flights.compute().hvplot.heatmap(x='day', y='carrier', C='depdelay', reduce_function=np.mean, colorbar=True)" + "corr = penguins[[c for c in penguins.columns if c.split(\"_\")[-1] in (\"mm\", \"g\")]].corr()\n", + "# Convert to long-form for heatmap\n", + "corr_df = corr.stack().reset_index()\n", + "corr_df.columns = ['variable_1', 'variable_2', 'correlation']\n", + "\n", + "corr_df.hvplot.heatmap(\n", + " x='variable_1',\n", + " y='variable_2',\n", + " C='correlation',\n", + " cmap='coolwarm',\n", + " clim=(-1, 1),\n", + " title='Correlation Heatmap',\n", + ")" ] }, { @@ -414,7 +429,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.table(columns=['Year', 'Population', 'Violent Crime rate'], width=400)" + "stocks.hvplot.table(columns=['date', 'close', 'volume'], width=400)" ] }, { @@ -431,7 +446,7 @@ "\n", "#### Histogram\n", "\n", - "The Histogram is the simplest example of a distribution; often we simply plot the distribution of a single variable, in this case the 'Violent Crime rate'. Additionally we can define a range over which to compute the histogram and the number of bins using the ``bin_range`` and ``bins`` arguments respectively:" + "The Histogram is the simplest example of a distribution; often we simply plot the distribution of a single variable, in this case the closing price. 
Additionally we can define a range over which to compute the histogram and the number of bins using the ``bin_range`` and ``bins`` arguments respectively:" ] }, { @@ -440,7 +455,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.hist(y='Violent Crime rate')" + "stocks.hvplot.hist(y='close')" ] }, { @@ -456,15 +471,15 @@ "metadata": {}, "outputs": [], "source": [ - "columns = ['Violent Crime rate', 'Property crime rate', 'Burglary rate']\n", - "crime.hvplot.hist(y=columns, bins=50, alpha=0.5, legend='top', height=400)" + "columns = ['close', 'high', 'low']\n", + "stocks.hvplot.hist(y=columns, bins=50, alpha=0.5, legend='top', height=400)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also group the data by another variable. Here we'll use ``subplots`` to split each carrier out into its own plot:" + "We can also split the data into separate plots. Here we'll use ``subplots`` to split each price column ('close', 'high' and 'low') out into its own plot:" ] }, { @@ -473,8 +488,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "flight_subset.hvplot.hist('depdelay', by='carrier', bins=20, bin_range=(-20, 100), width=300, subplots=True)" + "stocks.hvplot.hist(y=columns, legend='top', width=300, subplots=True)" ] }, { @@ -492,7 +506,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.kde(y='Violent Crime rate')" + "stocks.hvplot.kde(y='close')" ] }, { @@ -508,8 +522,8 @@ "metadata": {}, "outputs": [], "source": [ - "columns=['Violent Crime rate', 'Property crime rate', 'Burglary rate']\n", - "crime.hvplot.kde(y=columns, alpha=0.5, value_label='Rate', legend='top_right')" + "columns=['close', 'high', 'low']\n", + "stocks.hvplot.kde(y=columns, alpha=0.5, value_label='Price (USD)', legend='top_right')" ] }, { @@ -525,8 +539,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - 
"flight_subset.hvplot.kde('depdelay', by='carrier', xlim=(-20, 70), width=300, subplots=True)" + "penguins.hvplot.kde('body_mass_g', by='species', width=300, subplots=True)" ] }, { @@ -544,7 +557,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.box(y='Violent Crime rate')" + "penguins.hvplot.box(y='body_mass_g')" ] }, { @@ -560,9 +573,8 @@ "metadata": {}, "outputs": [], "source": [ - "columns=['Burglary rate', 'Larceny-theft rate', 'Motor vehicle theft rate',\n", - " 'Property crime rate', 'Violent Crime rate']\n", - "crime.hvplot.box(y=columns, group_label='Crime', legend=False, value_label='Rate (per 100k)', invert=True)" + "columns=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']\n", + "penguins.hvplot.box(y=columns, legend=False, group_label='Body Part', value_label='Size (mm)', invert=True)" ] }, { @@ -578,8 +590,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "flight_subset.hvplot.box('depdelay', by='carrier', ylim=(-10, 70))" + "penguins.hvplot.box(y='body_mass_g', by='species')" ] }, { @@ -603,7 +614,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y='Violent Crime rate') * crime.hvplot.scatter(x='Year', y='Violent Crime rate', c='k')" + "stocks_subset.hvplot(x='date', y='close') * stocks_subset.hvplot.scatter(x='date', y='close', c='k')" ] }, { @@ -619,8 +630,8 @@ "metadata": {}, "outputs": [], "source": [ - "(crime.hvplot.bar(x='Year', y='Violent Crime rate', rot=90, width=550) +\n", - " crime.hvplot.table(['Year', 'Population', 'Violent Crime rate'], width=420))" + "(stocks_subset.hvplot.bar(x='date', y='close', rot=90, width=550) +\n", + " stocks_subset.hvplot.table(['date', 'close', 'volume'], width=420))" ] }, { @@ -629,7 +640,7 @@ "source": [ "## Large data\n", "\n", - "The previous examples summarized the fairly large airline dataset using statistical plot types that aggregate the data into a feasible subset for plotting. 
We can instead aggregate the data directly into the viewable image using [datashader](https://datashader.org), which provides a rendering of the entire set of raw data available (as far as the resolution of the screen allows). Here we plot the 'airtime' against the 'distance':" + "The previous examples summarized the fairly large clusters dataset using statistical plot types that aggregate the data into a feasible subset for plotting. We can instead aggregate the data directly into the viewable image using [datashader](https://datashader.org), which provides a rendering of the entire set of raw data available (as far as the resolution of the screen allows). Here we plot the 'x' against the 'y' coordinates of the clusters:" ] }, { @@ -638,7 +649,7 @@ "metadata": {}, "outputs": [], "source": [ - "flights.hvplot.scatter(x='distance', y='airtime', datashade=True)" + "clusters.hvplot.scatter(x='x', y='y', by='cat', datashade=True)" ] }, { @@ -647,7 +658,7 @@ "source": [ "## Groupby\n", "\n", - "Thanks to the ability of HoloViews to explore a parameter space with a set of widgets we can apply a groupby along a particular column or dimension. For example we can view the distribution of departure delays by carrier grouped by day, allowing the user to choose which day to display:" + "Thanks to the ability of HoloViews to explore a parameter space with a set of widgets we can apply a groupby along a particular column or dimension. 
For example we can view the distribution of the penguins body mass by species grouped by year, allowing the user to choose which value group to display:" ] }, { @@ -656,7 +667,7 @@ "metadata": {}, "outputs": [], "source": [ - "flights.hvplot.violin(y='depdelay', by='carrier', groupby='dayofweek', ylim=(-20, 60), height=500)" + "penguins.hvplot.violin(y='body_mass_g', by='species', groupby='year', height=300)" ] }, { diff --git a/doc/user_guide/Plotting_Extensions.ipynb b/doc/user_guide/Plotting_Extensions.ipynb index d05ffb787..21f75710b 100644 --- a/doc/user_guide/Plotting_Extensions.ipynb +++ b/doc/user_guide/Plotting_Extensions.ipynb @@ -15,7 +15,9 @@ "metadata": {}, "outputs": [], "source": [ - "from hvplot.sample_data import us_crime" + "from hvplot.sampledata import apple_stocks\n", + "\n", + "df = apple_stocks('pandas')" ] }, { @@ -57,7 +59,7 @@ "metadata": {}, "outputs": [], "source": [ - "us_crime.hvplot(x='Year', y='Violent Crime rate')" + "df.hvplot(x='date', y='close')" ] }, { @@ -79,7 +81,7 @@ "source": [ "hvplot.output(backend='plotly')\n", "\n", - "us_crime.hvplot(x='Year', y='Violent Crime rate')" + "df.hvplot(x='date', y='close')" ] }, { @@ -110,7 +112,7 @@ "outputs": [], "source": [ "hvplot.output(backend='bokeh')\n", - "us_crime.hvplot(x='Year', y='Violent Crime rate', line_dash='dashed')" + "df.hvplot(x='date', y='close', line_dash='dashed')" ] }, { @@ -121,7 +123,7 @@ "outputs": [], "source": [ "hvplot.output(backend='matplotlib')\n", - "us_crime.hvplot(x='Year', y='Violent Crime rate', linestyle='dashed')" + "df.hvplot(x='date', y='close', linestyle='dashed')" ] }, { @@ -140,8 +142,8 @@ "outputs": [], "source": [ "hvplot.extension('matplotlib', compatibility='bokeh')\n", - "violent_crime = us_crime.hvplot(x='Year', y='Violent Crime rate', line_dash='dashed')\n", - "violent_crime" + "stocks_plot = df.hvplot(x='date', y='close', line_dash='dashed')\n", + "stocks_plot" ] }, { @@ -151,7 +153,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"violent_crime.opts.info()" + "stocks_plot.opts.info()" ] }, { @@ -175,7 +177,7 @@ "id": "706832e4", "metadata": {}, "source": [ - "In some cases it can be convenient to construct a plot with hvPlot and then get a handle on the figure object of the underlying plotting library to further customize the plot or to embed it in some more complex application. The `render` function allows to get a handle on the figure object. The following examples show that it's possible to use the API of Bokeh, Matplotlib or Plotly to update the title of the `violent_crime` plot." + "In some cases it can be convenient to construct a plot with hvPlot and then get a handle on the figure object of the underlying plotting library to further customize the plot or to embed it in some more complex application. The `render` function allows to get a handle on the figure object. The following examples show that it's possible to use the API of Bokeh, Matplotlib or Plotly to update the title of the `stocks_plot` plot." ] }, { @@ -185,7 +187,7 @@ "metadata": {}, "outputs": [], "source": [ - "violent_crime = us_crime.hvplot(x='Year', y='Violent Crime rate')" + "stocks_plot = df.hvplot(x='date', y='close')" ] }, { @@ -197,8 +199,8 @@ "source": [ "from bokeh.io import show\n", "\n", - "bk_fig = hvplot.render(violent_crime, backend='bokeh')\n", - "bk_fig.title = 'Violent crime'\n", + "bk_fig = hvplot.render(stocks_plot, backend='bokeh')\n", + "bk_fig.title = 'Apple stocks'\n", "show(bk_fig)" ] }, @@ -210,9 +212,9 @@ "outputs": [], "source": [ "%matplotlib inline\n", - "mpl_fig = hvplot.render(violent_crime, backend='matplotlib')\n", + "mpl_fig = hvplot.render(stocks_plot, backend='matplotlib')\n", "axes = mpl_fig.get_axes()\n", - "axes[0].set_title('Violent crime')\n", + "axes[0].set_title('Apple stocks')\n", "mpl_fig" ] }, @@ -225,8 +227,8 @@ "source": [ "from plotly.graph_objects import Figure\n", "\n", - "plotly_fig = hvplot.render(violent_crime, backend='plotly')\n", - "fig = 
Figure(plotly_fig).update_layout(title='Violent crime')\n", + "plotly_fig = hvplot.render(stocks_plot, backend='plotly')\n", + "fig = Figure(plotly_fig).update_layout(title='Apple stocks')\n", "fig" ] }, diff --git a/doc/user_guide/Plotting_with_Matplotlib.ipynb b/doc/user_guide/Plotting_with_Matplotlib.ipynb index 7b8fae8ee..4a242bf13 100644 --- a/doc/user_guide/Plotting_with_Matplotlib.ipynb +++ b/doc/user_guide/Plotting_with_Matplotlib.ipynb @@ -43,7 +43,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", "import hvplot.pandas # noqa\n", "import hvplot.dask # noqa\n", "\n", @@ -60,10 +59,10 @@ "\n", "We will be focusing on two different datasets:\n", "\n", - "- A small CSV file of US crime data, broken down by state\n", - "- A larger Parquet-format file of airline flight data\n", + "- A small CSV file of Apple Inc. (AAPL) daily stock prices\n", + "- A larger synthetic dataset of points organised into 5 clusters\n", "\n", - "The ``hvplot.sample_data`` module makes these datasets Intake data catalogue, which we can load either using pandas:" + "The ``hvplot.sampledata`` module provides access to both datasets via the ``hvsampledata`` package, which we can load either using pandas:" ] }, { @@ -72,11 +71,11 @@ "metadata": {}, "outputs": [], "source": [ - "from hvplot.sample_data import us_crime, airline_flights\n", + "from hvplot.sampledata import apple_stocks, synthetic_clusters, penguins\n", "\n", - "crime = us_crime.read()\n", - "print(type(crime))\n", - "crime.head()" + "stocks = apple_stocks('pandas')\n", + "print(type(stocks))\n", + "stocks.head()" ] }, { @@ -92,9 +91,9 @@ "metadata": {}, "outputs": [], "source": [ - "flights = airline_flights.to_dask().persist()\n", - "print(type(flights))\n", - "flights.head()" + "clusters = synthetic_clusters('dask', lazy=True)\n", + "print(type(clusters))\n", + "clusters.head()" ] }, { @@ -108,7 +107,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The ``dask.dataframe.DataFrame.hvplot``, 
``pandas.DataFrame.hvplot`` and ``intake.DataSource.plot`` interfaces (and Series equivalents) from HvPlot provide a powerful high-level API to generate complex plots. The ``.hvplot`` API can be called directly or used as a namespace to generate specific plot types." + "The ``dask.dataframe.DataFrame.hvplot`` and ``pandas.DataFrame.hvplot`` interfaces (and Series equivalents) from HvPlot provide a powerful high-level API to generate complex plots. The ``.hvplot`` API can be called directly or used as a namespace to generate specific plot types." ] }, { @@ -131,7 +130,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.line(x='Year', y='Violent Crime rate')" + "stocks.hvplot.line(x='date', y='close')" ] }, { @@ -147,14 +146,14 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y='Violent Crime rate', kind='scatter')" + "stocks.hvplot(x='date', y='close', kind='scatter')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To group the data by one or more additional columns, specify an additional ``by`` variable. As an example here we will plot the departure delay ('depdelay') as a function of 'distance', grouping the data by the 'carrier'. There are many available carriers, so we will select only two of them so that the plot is readable:" + "To group the data by one or more additional columns, specify an additional ``by`` variable. As an example here we will plot the y coordinate as a function of x, grouping the data by cluster category ('cat'). 
There are 5 clusters available, so we will select only two of them so that the plot is readable:" ] }, { @@ -163,8 +162,8 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['OH', 'F9'])]\n", - "flight_subset.hvplot(x='distance', y='depdelay', by='carrier', kind='scatter', alpha=0.2, persist=True)" + "cluster_subset = clusters[clusters.cat.isin(['d1', 'd2'])].compute()\n", + "cluster_subset.hvplot(x='x', y='y', by='cat', kind='scatter', alpha=0.2)" ] }, { @@ -173,7 +172,7 @@ "source": [ "Here we have specified the `x` axis explicitly, which can be omitted if the Pandas index column is already the desired x axis. Similarly, here we specified the `y` axis; by default all of the non-index columns would be plotted (which would be a lot of data in this case). If you don't specify the 'y' axis, it will have a default label named 'value', but you can then provide a y axis label explicitly using the ``value_label`` option.\n", "\n", - "Putting all of this together we will plot violent crime, robbery, and burglary rates on the y-axis, specifying 'Year' as the x, and relabel the y-axis to display the 'Rate'." + "Putting all of this together we will plot the opening and closing prices on the y-axis, specifying 'date' as the x, and relabel the y-axis to display the 'Price (USD)'." ] }, { @@ -182,8 +181,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y=['Violent Crime rate', 'Robbery rate', 'Burglary rate'],\n", - " value_label='Rate (per 100k people)')" + "stocks.hvplot(x='date', y=['open', 'close'], value_label='Price (USD)')" ] }, { @@ -200,7 +198,7 @@ "Instead of using the ``kind`` argument to the plot call, we can use the ``hvplot`` namespace, which lets us easily discover the range of plot types that are supported. 
Use tab completion to explore the available plot types:\n", "\n", "```python\n", - "crime.hvplot.\n", + "stocks.hvplot.\n", "```\n", "\n", "Plot types available include:\n", @@ -235,7 +233,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.area(x='Year', y=['Robbery', 'Aggravated assault'])" + "stocks.hvplot.area(x='date', y=['high', 'low'])" ] }, { @@ -251,14 +249,14 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.area(x='Year', y=['Aggravated assault', 'Robbery'], stacked=False, alpha=0.4)" + "stocks.hvplot.area(x='date', y=['high', 'low'], stacked=False, alpha=0.3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Another use for an area plot is to visualize the spread of a value. For instance using the flights dataset we may want to see the spread in mean delay values across carriers. For that purpose we compute the mean delay by day and carrier and then the min/max mean delay for across all carriers. Since the output of ``hvplot`` is just a regular holoviews object, we can use the overlay operator (\\*) to place the plots on top of each other." + "Another use for an area plot is to visualize the spread of a value over time. For instance using the stocks dataset we may want to see the range of monthly mean closing prices within each year. For that purpose we group by year and month to get the monthly mean close, and then compute the min/max of those monthly means per year. 
Since the output of ``hvplot`` is just a regular holoviews object, we can use the overlay operator (\\*) to place the plots on top of each other." ] }, { @@ -267,10 +265,13 @@ "metadata": {}, "outputs": [], "source": [ - "delay_min_max = flights.groupby(['day', 'carrier'])['carrier_delay'].mean().groupby('day').agg(['min', 'max'])\n", - "delay_mean = flights.groupby('day')['carrier_delay'].mean()\n", + "stocks['year'] = stocks['date'].dt.year\n", + "stocks['month'] = stocks['date'].dt.month\n", + "monthly = stocks.groupby(['year', 'month'])['close'].mean().reset_index()\n", + "y_min_max = monthly.groupby('year')['close'].agg(['min', 'max'])\n", + "y_mean = monthly.groupby('year')['close'].mean()\n", "\n", - "delay_min_max.hvplot.area(x='day', y='min', y2='max', alpha=0.2) * delay_mean.hvplot()" + "y_min_max.hvplot.area(x='year', y='min', y2='max', alpha=0.2) * y_mean.hvplot()" ] }, { @@ -279,7 +280,7 @@ "source": [ "#### Bars\n", "\n", - "In the simplest case we can use ``.hvplot.bar`` to plot ``x`` against ``y``. 
We'll use ``rot=90`` to rotate the tick labels on the x-axis making the dates easier to read:" ] }, { @@ -288,7 +289,8 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bar(x='Year', y='Violent Crime rate', rot=90, width=900)" + "stocks_subset = stocks[:50]\n", + "stocks_subset.hvplot.bar(x='date', y='close', rot=90, width=900)" ] }, { @@ -304,7 +306,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bar(x='Year', y=['Violent crime total', 'Property crime total'],\n", + "stocks_subset.hvplot.bar(x='date', y=['open', 'close'],\n", " stacked=True, rot=90, width=900, legend='top_left')" ] }, @@ -323,7 +325,8 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.scatter(x='Violent Crime rate', y='Burglary rate', c='Year')" + "penguins = penguins('pandas')\n", + "penguins.hvplot.scatter(x='bill_length_mm', y='bill_depth_mm', c='body_mass_g')" ] }, { @@ -348,7 +351,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.step(x='Year', y=['Robbery', 'Aggravated assault'])" + "stocks_subset.hvplot.step(x='date', y=['high', 'low'], rot=45)" ] }, { @@ -357,7 +360,7 @@ "source": [ "#### HexBins\n", "\n", - "You can create hexagonal bin plots with the ``hexbin`` method. Hexbin plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually. Since these data are not regularly distributed, we'll use the ``logz`` option to map z-axis (color) to a log scale colorbar." + "You can create hexagonal bin plots with the ``hexbin`` method. Hexbin plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually." ] }, { @@ -366,16 +369,7 @@ "metadata": {}, "outputs": [], "source": [ - "# flights.hvplot.hexbin(x='airtime', y='arrdelay', width=600, height=500, logz=True);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "Output suppressed as this is not currently supported with the Matplotlib backend and doesn't display any plot.\n", - "
" + "penguins.hvplot.hexbin(x='bill_length_mm', y='bill_depth_mm')" ] }, { @@ -393,7 +387,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bivariate(x='Violent Crime rate', y='Burglary rate', width=600, height=500)" + "penguins.hvplot.bivariate(x='bill_length_mm', y='bill_depth_mm')" ] }, { @@ -402,7 +396,7 @@ "source": [ "#### HeatMap\n", "\n", - "A ``HeatMap`` lets us view the relationship between three variables, so we specify the 'x' and 'y' variables and an additional 'C' variable. Additionally we can define a ``reduce_function`` that computes the values for each bin from the samples that fall into it. Here we plot the 'depdelay' (i.e. departure delay) for each day of the month and carrier in the dataset:" + "A ``HeatMap`` lets us view the relationship between three variables, so we specify the 'x' and 'y' variables and an additional 'C' variable. Additionally we can define a ``reduce_function`` that computes the values for each bin from the samples that fall into it. 
Here we compute pairwise correlations between the penguins measurement columns, reshape the correlation matrix to long form, and plot the correlation coefficient for each variable pair:" ] }, { @@ -411,7 +405,19 @@ "metadata": {}, "outputs": [], "source": [ - "flights.compute().hvplot.heatmap(x='day', y='carrier', C='depdelay', reduce_function=np.mean).opts(show_values=False)" + "corr = penguins[[c for c in penguins.columns if c.split(\"_\")[-1] in (\"mm\", \"g\")]].corr()\n", + "# Convert to long-form for heatmap\n", + "corr_df = corr.stack().reset_index()\n", + "corr_df.columns = ['variable_1', 'variable_2', 'correlation']\n", + "\n", + "corr_df.hvplot.heatmap(\n", + " x='variable_1',\n", + " y='variable_2',\n", + " C='correlation',\n", + " cmap='coolwarm',\n", + " clim=(-1, 1),\n", + " title='Correlation Heatmap',\n", + ").opts(show_values=False)" ] }, { @@ -429,7 +435,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.table(columns=['Year', 'Population', 'Violent Crime rate'], width=400)" + "stocks.hvplot.table(columns=['date', 'close', 'volume'], width=400)" ] }, { @@ -446,7 +452,7 @@ "\n", "#### Histogram\n", "\n", - "The Histogram is the simplest example of a distribution; often we simply plot the distribution of a single variable, in this case the 'Violent Crime rate'. Additionally we can define a range over which to compute the histogram and the number of bins using the ``bin_range`` and ``bins`` arguments respectively:" + "The Histogram is the simplest example of a distribution; often we simply plot the distribution of a single variable, in this case the closing price. 
Additionally we can define a range over which to compute the histogram and the number of bins using the ``bin_range`` and ``bins`` arguments respectively:" ] }, { @@ -455,7 +461,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.hist(y='Violent Crime rate')" + "stocks.hvplot.hist(y='close')" ] }, { @@ -471,15 +477,15 @@ "metadata": {}, "outputs": [], "source": [ - "columns = ['Violent Crime rate', 'Property crime rate', 'Burglary rate']\n", - "crime.hvplot.hist(y=columns, bins=50, alpha=0.5, legend='top', height=400)" + "columns = ['close', 'high', 'low']\n", + "stocks.hvplot.hist(y=columns, bins=50, alpha=0.5, legend='top', height=400)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also group the data by another variable. Here we'll use ``subplots`` to split each carrier out into its own plot:" + "We can also display each column in its own panel rather than overlaying them. Here we'll use ``subplots`` to split each column out into its own plot:" ] }, { @@ -488,17 +494,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "# flight_subset.hvplot.hist('depdelay', by='carrier', bins=20, bin_range=(-20, 100), width=300, subplots=True);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "Output suppressed as this is not currently supported with the Matplotlib backend and raises an error. Tip: execute flight_subset.compute() before plotting to avoid the error.\n", - "
" + "stocks.hvplot.hist(y=columns, legend='top', height=400, subplots=True)" ] }, { @@ -516,7 +512,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.kde(y='Violent Crime rate')" + "stocks.hvplot.kde(y='close')" ] }, { @@ -532,8 +528,8 @@ "metadata": {}, "outputs": [], "source": [ - "columns=['Violent Crime rate', 'Property crime rate', 'Burglary rate']\n", - "crime.hvplot.kde(y=columns, alpha=0.5, value_label='Rate', legend='top_right')" + "columns=['close', 'high', 'low']\n", + "stocks.hvplot.kde(y=columns, alpha=0.3, value_label='Price (USD)', legend='top_right')" ] }, { @@ -549,8 +545,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "flight_subset.hvplot.kde('depdelay', by='carrier', xlim=(-20, 70), width=300, subplots=True)" + "penguins.hvplot.kde('body_mass_g', by='species', width=300, subplots=True)" ] }, { @@ -568,7 +563,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.box(y='Violent Crime rate')" + "penguins.hvplot.box(y='body_mass_g')" ] }, { @@ -584,9 +579,8 @@ "metadata": {}, "outputs": [], "source": [ - "columns=['Burglary rate', 'Larceny-theft rate', 'Motor vehicle theft rate',\n", - " 'Property crime rate', 'Violent Crime rate']\n", - "crime.hvplot.box(y=columns, group_label='Crime', legend=False, value_label='Rate (per 100k)', invert=True)" + "columns=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']\n", + "penguins.hvplot.box(y=columns, legend=False, group_label='Body Part', value_label='Size (mm)', invert=True)" ] }, { @@ -602,17 +596,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "# flight_subset.hvplot.box('depdelay', by='carrier', ylim=(-10, 70));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "Output suppressed as this is not currently supported with the Matplotlib backend and displays an empty plot.\n", - "
" + "penguins.hvplot.box(y='body_mass_g', by='species')" ] }, { @@ -636,7 +620,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y='Violent Crime rate') * crime.hvplot.scatter(x='Year', y='Violent Crime rate', c='k')" + "stocks_subset.hvplot(x='date', y='close') * stocks_subset.hvplot.scatter(x='date', y='close', c='k')" ] }, { @@ -652,15 +636,15 @@ "metadata": {}, "outputs": [], "source": [ - "(crime.hvplot.bar(x='Year', y='Violent Crime rate', rot=90, width=550) +\n", - " crime.hvplot.table(['Year', 'Population', 'Violent Crime rate'], width=420))" + "(stocks_subset.hvplot.bar(x='date', y='close', rot=90, width=550) +\n", + " stocks_subset.hvplot.table(['date', 'close', 'volume'], width=420))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can further enhance the plot by using a `step`s chart instead of a `bar` chart to avoid visual clutter and better represent the continuous nature of time series data. Additionally, we can display a table summarizing decade-averaged statistics to provide a quick overview of the data. The `sublabel` format and size can be customized, and if not needed, can be removed by setting `sublabel_format=\"\"` in [opts](https://holoviews.org/user_guide/Applying_Customizations.html)." + "We can further enhance the plot by using a `step` chart instead of a `bar` chart to avoid visual clutter and better represent the continuous nature of time series data. Additionally, we can display a table summarizing yearly-averaged statistics to provide a quick overview of the data. The `sublabel` format and size can be customized, and if not needed, can be removed by setting `sublabel_format=\"\"` in [opts](https://holoviews.org/user_guide/Applying_Customizations.html)." 
] }, { @@ -670,16 +654,15 @@ "outputs": [], "source": [ "tabledata = (\n", - " crime.assign(\n", + " stocks_subset.assign(\n", " **{\n", - " \"Decade\": (crime.Year // 10) * 10, # Calculate decade for each year\n", - " \"Population [mi]\": crime.Population / 1e6, # Population in millions\n", + " \"Year\": stocks.date.dt.year,\n", " }\n", " )\n", - " .groupby(\"Decade\") # Group data by decade\n", - " .mean() # Calculate mean values for each decade group\n", - " [[\"Population [mi]\", \"Violent Crime rate\"]] # Select desired columns\n", - " .astype(\"int\") # Convert selected columns to integers\n", + " .groupby(\"Year\") # Group data by year\n", + " .mean(numeric_only=True) # Calculate mean values for each year group\n", + " [[\"close\", \"volume\"]] # Select desired columns\n", + " .astype({\"volume\": \"int\"}) # Convert volume to integer\n", ")" ] }, @@ -690,8 +673,8 @@ "outputs": [], "source": [ "(\n", - " crime.hvplot.step(x='Year', y='Violent Crime rate', rot=90, width=550)+\n", - " tabledata.hvplot.table(['Decade', 'Population [mi]', 'Violent Crime rate'], width=15, height=10)\n", + " stocks_subset.hvplot.step(x='date', y='close', rot=90, width=550)+\n", + " tabledata.hvplot.table(['Year', 'close', 'volume'], width=15, height=10)\n", ").opts(sublabel_format=\"{alpha})\", sublabel_size=14)" ] }, @@ -701,7 +684,7 @@ "source": [ "## Large data\n", "\n", - "The previous examples summarized the fairly large airline dataset using statistical plot types that aggregate the data into a feasible subset for plotting. We can instead aggregate the data directly into the viewable image using [datashader](https://datashader.org), which provides a rendering of the entire set of raw data available (as far as the resolution of the screen allows). Here we plot the 'airtime' against the 'distance':" + "The previous examples summarized the fairly large clusters dataset using statistical plot types that aggregate the data into a feasible subset for plotting. 
We can instead aggregate the data directly into the viewable image using [datashader](https://datashader.org), which provides a rendering of the entire set of raw data available (as far as the resolution of the screen allows). Here we plot the 'x' against the 'y' coordinates of the clusters:" ] }, { @@ -710,7 +693,7 @@ "metadata": {}, "outputs": [], "source": [ - "flights.hvplot.scatter(x='distance', y='airtime', datashade=True)" + "clusters.hvplot.scatter(x='x', y='y', by='cat', datashade=True)" ] }, { @@ -719,7 +702,7 @@ "source": [ "## Groupby\n", "\n", - "Thanks to the ability of HoloViews to explore a parameter space with a set of widgets we can apply a groupby along a particular column or dimension. For example we can view the distribution of departure delays by carrier grouped by day, allowing the user to choose which day to display:" + "Thanks to the ability of HoloViews to explore a parameter space with a set of widgets we can apply a groupby along a particular column or dimension. 
For example we can view the distribution of the penguins body mass by species grouped by year, allowing the user to choose which value group to display:" ] }, { @@ -728,7 +711,7 @@ "metadata": {}, "outputs": [], "source": [ - "flights.hvplot.violin(y='depdelay', by='carrier', groupby='dayofweek', ylim=(-20, 60), height=500)" + "penguins.hvplot.violin(y='body_mass_g', by='species', groupby='year', height=300)" ] }, { diff --git a/doc/user_guide/Plotting_with_Plotly.ipynb b/doc/user_guide/Plotting_with_Plotly.ipynb index b229bb2a7..6d7f732a4 100644 --- a/doc/user_guide/Plotting_with_Plotly.ipynb +++ b/doc/user_guide/Plotting_with_Plotly.ipynb @@ -43,7 +43,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", "import hvplot.pandas # noqa\n", "import hvplot.dask # noqa\n", "\n", @@ -60,10 +59,10 @@ "\n", "We will be focusing on two different datasets:\n", "\n", - "- A small CSV file of US crime data, broken down by state\n", - "- A larger Parquet-format file of airline flight data\n", + "- A small CSV file of Apple Inc. 
(AAPL) daily stock prices\n", + "- A larger synthetic dataset of points organised into 5 clusters\n", "\n", - "The ``hvplot.sample_data`` module makes these datasets Intake data catalogue, which we can load either using pandas:" + "The ``hvplot.sampledata`` module provides access to both datasets via the ``hvsampledata`` package, which we can load either using pandas:" ] }, { @@ -72,11 +71,11 @@ "metadata": {}, "outputs": [], "source": [ - "from hvplot.sample_data import us_crime, airline_flights\n", + "from hvplot.sampledata import apple_stocks, synthetic_clusters, penguins\n", "\n", - "crime = us_crime.read()\n", - "print(type(crime))\n", - "crime.head()" + "stocks = apple_stocks('pandas')\n", + "print(type(stocks))\n", + "stocks.head()" ] }, { @@ -92,9 +91,9 @@ "metadata": {}, "outputs": [], "source": [ - "flights = airline_flights.to_dask().persist()\n", - "print(type(flights))\n", - "flights.head()" + "clusters = synthetic_clusters('dask', lazy=True)\n", + "print(type(clusters))\n", + "clusters.head()" ] }, { @@ -108,7 +107,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The ``dask.dataframe.DataFrame.hvplot``, ``pandas.DataFrame.hvplot`` and ``intake.DataSource.plot`` interfaces (and Series equivalents) from HvPlot provide a powerful high-level API to generate complex plots. The ``.hvplot`` API can be called directly or used as a namespace to generate specific plot types." + "The ``dask.dataframe.DataFrame.hvplot`` and ``pandas.DataFrame.hvplot`` interfaces (and Series equivalents) from HvPlot provide a powerful high-level API to generate complex plots. The ``.hvplot`` API can be called directly or used as a namespace to generate specific plot types." 
] }, { @@ -131,7 +130,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.line(x='Year', y='Violent Crime rate')" + "stocks.hvplot.line(x='date', y='close')" ] }, { @@ -147,14 +146,14 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y='Violent Crime rate', kind='scatter')" + "stocks.hvplot(x='date', y='close', kind='scatter')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To group the data by one or more additional columns, specify an additional ``by`` variable. As an example here we will plot the departure delay ('depdelay') as a function of 'distance', grouping the data by the 'carrier'. There are many available carriers, so we will select only two of them so that the plot is readable:" + "To group the data by one or more additional columns, specify an additional ``by`` variable. As an example here we will plot the y coordinate as a function of x, grouping the data by cluster category ('cat'). There are 5 clusters available, so we will select only two of them so that the plot is readable:" ] }, { @@ -163,8 +162,8 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['OH', 'F9'])]\n", - "flight_subset.hvplot(x='distance', y='depdelay', by='carrier', kind='scatter', alpha=0.2, persist=True)" + "cluster_subset = clusters[clusters.cat.isin(['d1', 'd2'])].compute()\n", + "cluster_subset.hvplot(x='x', y='y', by='cat', kind='scatter')" ] }, { @@ -173,7 +172,7 @@ "source": [ "Here we have specified the `x` axis explicitly, which can be omitted if the Pandas index column is already the desired x axis. Similarly, here we specified the `y` axis; by default all of the non-index columns would be plotted (which would be a lot of data in this case). 
If you don't specify the 'y' axis, it will have a default label named 'value', but you can then provide a y axis label explicitly using the ``value_label`` option.\n", "\n", - "Putting all of this together we will plot violent crime, robbery, and burglary rates on the y-axis, specifying 'Year' as the x, and relabel the y-axis to display the 'Rate'." + "Putting all of this together we will plot the opening and closing prices on the y-axis, specifying 'date' as the x, and relabel the y-axis to display the 'Price (USD)'." ] }, { @@ -182,8 +181,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y=['Violent Crime rate', 'Robbery rate', 'Burglary rate'],\n", - " value_label='Rate (per 100k people)')" + "stocks.hvplot(x='date', y=['open', 'close'], value_label='Price (USD)')" ] }, { @@ -200,7 +198,7 @@ "Instead of using the ``kind`` argument to the plot call, we can use the ``hvplot`` namespace, which lets us easily discover the range of plot types that are supported. Use tab completion to explore the available plot types:\n", "\n", "```python\n", - "crime.hvplot.\n", + "stocks.hvplot.\n", "```\n", "\n", "Plot types available include:\n", @@ -235,14 +233,14 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.area(x='Year', y=['Robbery', 'Aggravated assault'])" + "stocks.hvplot.area(x='date', y=['high', 'low'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also explicitly set ``stacked`` to False and define an ``alpha`` value to compare the values directly:" + "We can also explicitly set ``stacked`` to False to compare the values directly:" ] }, { @@ -251,14 +249,14 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.area(x='Year', y=['Aggravated assault', 'Robbery'], stacked=False, alpha=0.4)" + "stocks.hvplot.area(x='date', y=['high', 'low'], stacked=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Another use for an area plot is to visualize the spread of a value. 
For instance using the flights dataset we may want to see the spread in mean delay values across carriers. For that purpose we compute the mean delay by day and carrier and then the min/max mean delay for across all carriers. Since the output of ``hvplot`` is just a regular holoviews object, we can use the overlay operator (\\*) to place the plots on top of each other." + "Another use for an area plot is to visualize the spread of a value over time. For instance using the stocks dataset we may want to see the range of monthly mean closing prices within each year. For that purpose we group by year and month to get the monthly mean close, and then compute the min/max of those monthly means per year. Since the output of ``hvplot`` is just a regular holoviews object, we can use the overlay operator (\\*) to place the plots on top of each other." ] }, { @@ -267,10 +265,13 @@ "metadata": {}, "outputs": [], "source": [ - "delay_min_max = flights.groupby(['day', 'carrier'])['carrier_delay'].mean().groupby('day').agg(['min', 'max'])\n", - "delay_mean = flights.groupby('day')['carrier_delay'].mean()\n", + "stocks['year'] = stocks['date'].dt.year\n", + "stocks['month'] = stocks['date'].dt.month\n", + "monthly = stocks.groupby(['year', 'month'])['close'].mean().reset_index()\n", + "y_min_max = monthly.groupby('year')['close'].agg(['min', 'max'])\n", + "y_mean = monthly.groupby('year')['close'].mean()\n", "\n", - "delay_min_max.hvplot.area(x='day', y='min', y2='max', alpha=0.2) * delay_mean.hvplot()" + "y_min_max.hvplot.area(x='year', y='min', y2='max') * y_mean.hvplot()" ] }, { @@ -279,7 +280,7 @@ "source": [ "#### Bars\n", "\n", - "In the simplest case we can use ``.hvplot.bar`` to plot ``x`` against ``y``. We'll use ``rot=90`` to rotate the tick labels on the x-axis making the years easier to read:" + "In the simplest case we can use ``.hvplot.bar`` to plot ``x`` against ``y``. 
We'll use ``rot=90`` to rotate the tick labels on the x-axis making the dates easier to read:" ] }, { @@ -288,7 +289,8 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bar(x='Year', y='Violent Crime rate', rot=90)" + "stocks_subset = stocks[:50]\n", + "stocks_subset.hvplot.bar(x='date', y='close', rot=90)" ] }, { @@ -304,7 +306,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.bar(x='Year', y=['Violent crime total', 'Property crime total'],\n", + "stocks_subset.hvplot.bar(x='date', y=['open', 'close'],\n", " stacked=True, rot=90, width=800, legend='top_left')" ] }, @@ -323,7 +325,8 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.scatter(x='Violent Crime rate', y='Burglary rate', c='Year')" + "penguins = penguins('pandas')\n", + "penguins.hvplot.scatter(x='bill_length_mm', y='bill_depth_mm', c='body_mass_g')" ] }, { @@ -348,7 +351,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.step(x='Year', y=['Robbery', 'Aggravated assault'])" + "stocks_subset.hvplot.step(x='date', y=['high', 'low'])" ] }, { @@ -357,7 +360,7 @@ "source": [ "#### HexBins\n", "\n", - "You can create hexagonal bin plots with the ``hexbin`` method. Hexbin plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually. Since these data are not regularly distributed, we'll use the ``logz`` option to map z-axis (color) to a log scale colorbar." + "You can create hexagonal bin plots with the ``hexbin`` method. Hexbin plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually." 
] }, { @@ -366,7 +369,7 @@ "metadata": {}, "outputs": [], "source": [ - "# flights.hvplot.hexbin(x='airtime', y='arrdelay', width=600, height=500, logz=True)" + "# penguins.hvplot.hexbin(x='bill_length_mm', y='bill_depth_mm')" ] }, { @@ -393,7 +396,7 @@ "metadata": {}, "outputs": [], "source": [ - "# crime.hvplot.bivariate(x='Violent Crime rate', y='Burglary rate', width=600, height=500)" + "# penguins.hvplot.bivariate(x='bill_length_mm', y='bill_depth_mm')" ] }, { @@ -411,7 +414,7 @@ "source": [ "#### HeatMap\n", "\n", - "A ``HeatMap`` lets us view the relationship between three variables, so we specify the 'x' and 'y' variables and an additional 'C' variable. Additionally we can define a ``reduce_function`` that computes the values for each bin from the samples that fall into it. Here we plot the 'depdelay' (i.e. departure delay) for each day of the month and carrier in the dataset:" + "A ``HeatMap`` lets us view the relationship between three variables, so we specify the 'x' and 'y' variables and an additional 'C' variable. Additionally we can define a ``reduce_function`` that computes the values for each bin from the samples that fall into it. 
Here we compute pairwise correlations between the penguins measurement columns, reshape the correlation matrix to long form, and plot the correlation coefficient for each variable pair:" ] }, { @@ -420,7 +423,19 @@ "metadata": {}, "outputs": [], "source": [ - "flights.compute().hvplot.heatmap(x='day', y='carrier', C='depdelay', reduce_function=np.mean, colorbar=True)" + "corr = penguins[[c for c in penguins.columns if c.split(\"_\")[-1] in (\"mm\", \"g\")]].corr()\n", + "# Convert to long-form for heatmap\n", + "corr_df = corr.stack().reset_index()\n", + "corr_df.columns = ['variable_1', 'variable_2', 'correlation']\n", + "\n", + "corr_df.hvplot.heatmap(\n", + " x='variable_1',\n", + " y='variable_2',\n", + " C='correlation',\n", + " cmap='coolwarm',\n", + " clim=(-1, 1),\n", + " title='Correlation Heatmap',\n", + ")" ] }, { @@ -438,7 +453,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.table(columns=['Year', 'Population', 'Violent Crime rate'], width=400)" + "stocks.hvplot.table(columns=['date', 'close', 'volume'], width=400)" ] }, { @@ -455,7 +470,7 @@ "\n", "#### Histogram\n", "\n", - "The Histogram is the simplest example of a distribution; often we simply plot the distribution of a single variable, in this case the 'Violent Crime rate'. Additionally we can define a range over which to compute the histogram and the number of bins using the ``bin_range`` and ``bins`` arguments respectively:" + "The Histogram is the simplest example of a distribution; often we simply plot the distribution of a single variable, in this case the closing price. 
Additionally we can define a range over which to compute the histogram and the number of bins using the ``bin_range`` and ``bins`` arguments respectively:" ] }, { @@ -464,7 +479,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.hist(y='Violent Crime rate')" + "stocks.hvplot.hist(y='close')" ] }, { @@ -480,15 +495,15 @@ "metadata": {}, "outputs": [], "source": [ - "columns = ['Violent Crime rate', 'Property crime rate', 'Burglary rate']\n", - "crime.hvplot.hist(y=columns, bins=50, alpha=0.5, legend='top', height=400)" + "columns = ['close', 'high', 'low']\n", + "stocks.hvplot.hist(y=columns, bins=50, legend='top', height=400)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also group the data by another variable. Here we'll use ``subplots`` to split each carrier out into its own plot:" + "We can also display each column in its own panel rather than overlaying them. Here we'll use ``subplots`` to split each column out into its own plot:" ] }, { @@ -497,8 +512,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "flight_subset.hvplot.hist('depdelay', by='carrier', bins=20, bin_range=(-20, 100), width=300, subplots=True)" + "stocks.hvplot.hist(y=columns, legend='top', height=400, subplots=True)" ] }, { @@ -516,7 +530,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.kde(y='Violent Crime rate')" + "stocks.hvplot.kde(y='close')" ] }, { @@ -532,8 +546,8 @@ "metadata": {}, "outputs": [], "source": [ - "columns=['Violent Crime rate', 'Property crime rate', 'Burglary rate']\n", - "crime.hvplot.kde(y=columns, alpha=0.5, value_label='Rate', legend='top_right')" + "columns=['close', 'high', 'low']\n", + "stocks.hvplot.kde(y=columns, value_label='Price (USD)', legend='top_right')" ] }, { @@ -549,8 +563,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "flight_subset.hvplot.kde('depdelay', by='carrier', 
xlim=(-20, 70), width=300, subplots=True)" + "penguins.hvplot.kde('body_mass_g', by='species', width=300, subplots=True)" ] }, { @@ -568,7 +581,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot.box(y='Violent Crime rate')" + "penguins.hvplot.box(y='body_mass_g')" ] }, { @@ -584,9 +597,8 @@ "metadata": {}, "outputs": [], "source": [ - "columns=['Burglary rate', 'Larceny-theft rate', 'Motor vehicle theft rate',\n", - " 'Property crime rate', 'Violent Crime rate']\n", - "crime.hvplot.box(y=columns, group_label='Crime', legend=False, value_label='Rate (per 100k)', invert=True)" + "columns=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']\n", + "penguins.hvplot.box(y=columns, legend=False, group_label='Body Part', value_label='Size (mm)', invert=True)" ] }, { @@ -602,8 +614,7 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset = flights[flights.carrier.isin(['AA', 'US', 'OH'])]\n", - "flight_subset.hvplot.box('depdelay', by='carrier', ylim=(-10, 70))" + "penguins.hvplot.box(y='body_mass_g', by='species')" ] }, { @@ -627,7 +638,7 @@ "metadata": {}, "outputs": [], "source": [ - "crime.hvplot(x='Year', y='Violent Crime rate') * crime.hvplot.scatter(x='Year', y='Violent Crime rate', c='k')" + "stocks_subset.hvplot(x='date', y='close') * stocks_subset.hvplot.scatter(x='date', y='close', c='k')" ] }, { @@ -643,8 +654,8 @@ "metadata": {}, "outputs": [], "source": [ - "(crime.hvplot.bar(x='Year', y='Violent Crime rate', rot=90, width=550) +\n", - " crime.hvplot.table(['Year', 'Population', 'Violent Crime rate'], width=420))" + "(stocks_subset.hvplot.bar(x='date', y='close', rot=90, width=550) +\n", + " stocks_subset.hvplot.table(['date', 'close', 'volume'], width=420))" ] }, { @@ -653,7 +664,7 @@ "source": [ "## Large data\n", "\n", - "The previous examples summarized the fairly large airline dataset using statistical plot types that aggregate the data into a feasible subset for plotting. 
We can instead aggregate the data directly into the viewable image using [datashader](https://datashader.org), which provides a rendering of the entire set of raw data available (as far as the resolution of the screen allows). Here we plot the 'airtime' against the 'distance':" + "The previous examples summarized the fairly large clusters dataset using statistical plot types that aggregate the data into a feasible subset for plotting. We can instead aggregate the data directly into the viewable image using [datashader](https://datashader.org), which provides a rendering of the entire set of raw data available (as far as the resolution of the screen allows). Here we plot the 'x' against the 'y' coordinates of the clusters:" ] }, { @@ -662,7 +673,7 @@ "metadata": {}, "outputs": [], "source": [ - "flights.hvplot.scatter(x='distance', y='airtime', datashade=True)" + "clusters.hvplot.scatter(x='x', y='y', by='cat', datashade=True)" ] }, { @@ -671,7 +682,7 @@ "source": [ "## Groupby\n", "\n", - "Thanks to the ability of HoloViews to explore a parameter space with a set of widgets we can apply a groupby along a particular column or dimension. For example we can view the distribution of departure delays by carrier grouped by day, allowing the user to choose which day to display:" + "Thanks to the ability of HoloViews to explore a parameter space with a set of widgets we can apply a groupby along a particular column or dimension. 
For example we can view the distribution of the penguins body mass by species grouped by year, allowing the user to choose which value group to display:" ] }, { @@ -680,7 +691,7 @@ "metadata": {}, "outputs": [], "source": [ - "flights.hvplot.violin(y='depdelay', by='carrier', groupby='dayofweek', ylim=(-20, 60), height=500)" + "penguins.hvplot.violin(y='body_mass_g', by='species', groupby='year', height=300)" ] }, { diff --git a/doc/user_guide/Subplots.ipynb b/doc/user_guide/Subplots.ipynb index 898c3512e..9ef1e9bcf 100644 --- a/doc/user_guide/Subplots.ipynb +++ b/doc/user_guide/Subplots.ipynb @@ -26,9 +26,12 @@ "import hvplot.pandas # noqa\n", "import hvplot.xarray # noqa\n", "\n", - "from hvplot.sample_data import airline_flights, us_crime\n", + "from hvplot.sampledata import penguins, stocks\n", "\n", - "us_crime.hvplot(x='Year', y=['Burglary rate', 'Violent Crime rate', 'Robbery rate'], value_label='Rate')" + "stocks = stocks('pandas')\n", + "penguins = penguins('pandas')\n", + "\n", + "stocks.hvplot(x='date', y=['Apple', 'Amazon', 'Google'], value_label='Stock Prices')" ] }, { @@ -44,8 +47,7 @@ "metadata": {}, "outputs": [], "source": [ - "us_crime.hvplot(x='Year', y=['Burglary rate', 'Violent Crime rate', 'Robbery rate'],\n", - " value_label='Rate', subplots=True, width=300, height=200)" + "stocks.hvplot(x='date', y=['Apple', 'Amazon', 'Google'], value_label='Stock Prices', subplots=True, width=300, height=200)" ] }, { @@ -63,8 +65,8 @@ "metadata": {}, "outputs": [], "source": [ - "us_crime.hvplot(x='Year', y=['Robbery', 'Robbery rate', 'Burglary', 'Burglary rate'], \n", - " width=350, height=300, subplots=True, shared_axes=False).cols(2)" + "penguins.hvplot.scatter(x='bill_length_mm', y=['bill_depth_mm', 'flipper_length_mm', 'body_mass_g'],\n", + " width=300, height=300, subplots=True, shared_axes=False).cols(2)" ] }, { @@ -82,12 +84,8 @@ "metadata": {}, "outputs": [], "source": [ - "flights = airline_flights.read()\n", - "\n", - "flight_subset = 
flights[flights.carrier.isin(['OH', 'F9', 'US'])].sample(2000)\n", - "\n", - "flight_subset.hvplot.scatter(x='arrdelay', y='depdelay', by='carrier',\n", - " subplots=True, width=250, height=250, alpha=0.1)" + "penguins.hvplot.scatter(x='bill_length_mm', y='bill_depth_mm', by='species',\n", + " subplots=True, width=250, height=250, alpha=0.5)" ] }, { @@ -106,8 +104,8 @@ "metadata": {}, "outputs": [], "source": [ - "flight_subset.sort_values('dayofweek').hvplot.scatter(x='arrdelay', y='depdelay', \n", - " row='dayofweek', col='carrier', alpha=0.2)" + "penguins.hvplot.scatter(x='bill_length_mm', y='bill_depth_mm',\n", + " row='island', col='species', alpha=0.5)" ] }, { diff --git a/doc/user_guide/Viewing.ipynb b/doc/user_guide/Viewing.ipynb index 085fed95d..8f62c5058 100644 --- a/doc/user_guide/Viewing.ipynb +++ b/doc/user_guide/Viewing.ipynb @@ -49,12 +49,15 @@ "metadata": {}, "outputs": [], "source": [ - "from hvplot.sample_data import airline_flights, us_crime\n", + "from hvplot.sampledata import penguins, stocks\n", "\n", - "violent_crime = us_crime.hvplot(x='Year', y='Violent Crime rate', width=400)\n", - "burglaries = us_crime.hvplot(x='Year', y='Burglary rate', width=400)\n", + "penguins = penguins('pandas')\n", + "stocks = stocks('pandas')\n", "\n", - "violent_crime + burglaries" + "apple_stocks = stocks.hvplot(x='date', y='Apple', width=400, ylabel='Apple stock prices')\n", + "netflix_stocks = stocks.hvplot(x='date', y='Netflix', width=400, ylabel='Netflix stock prices')\n", + "\n", + "apple_stocks + netflix_stocks" ] }, { @@ -74,7 +77,7 @@ "source": [ "import panel as pn\n", "\n", - "pane = pn.panel(violent_crime)\n", + "pane = pn.panel(apple_stocks)\n", "pane" ] }, @@ -91,7 +94,7 @@ "metadata": {}, "outputs": [], "source": [ - "pane.object = burglaries" + "pane.object = netflix_stocks" ] }, { @@ -107,7 +110,7 @@ "metadata": {}, "outputs": [], "source": [ - "pane.object *= violent_crime " + "pane.object *= apple_stocks" ] }, { @@ -142,7 +145,7 @@ "metadata": 
{}, "outputs": [], "source": [ - "plot = airline_flights.hvplot.hexbin(x='airtime', y='arrdelay', colorbar=True, width=600, height=500, logz=True)\n", + "plot = penguins.hvplot.hexbin(x='bill_length_mm', y='bill_depth_mm')\n", "\n", "hvplot.save(plot, 'test.html')" ] diff --git a/hvplot/sample_data.py b/hvplot/sample_data.py index 54a7e8ca6..cb9658bc0 100644 --- a/hvplot/sample_data.py +++ b/hvplot/sample_data.py @@ -1,30 +1,30 @@ """ -Loads hvPlot sample data using intake catalogue. +Deprecated. Use ``hvplot.sampledata`` instead. + +This module previously loaded hvPlot sample data using an intake catalogue. +It has been replaced by ``hvplot.sampledata``, which uses the ``hvsampledata`` +package and does not require intake. """ -import os +import warnings + +from .util import _find_stack_level +from . import sampledata as _sampledata -try: - from intake import open_catalog - import intake_parquet # noqa - import intake_xarray # noqa - import s3fs # noqa -except ImportError: - raise ImportError( - """Loading hvPlot sample data requires: - * intake - * intake-parquet - * intake-xarray - * s3fs - Install these using conda or pip before loading data.""" - ) +warnings.warn( + "The 'hvplot.sample_data' module is deprecated and will be removed in a " + "future version. Use 'hvplot.sampledata' instead.", + FutureWarning, + stacklevel=_find_stack_level(), +) -_file_path = os.path.dirname(__file__) -_cat_path = os.path.join(_file_path, 'datasets.yaml') +from .sampledata import * # noqa: F401, F403, E402 -# Load catalogue -catalogue = open_catalog(_cat_path) -# Add catalogue entries to namespace -for _c in catalogue: - globals()[_c] = catalogue[_c] +def __getattr__(name): + if not _sampledata._hvsampledata_available: + raise AttributeError( + "Install the package 'hvsampledata' to access datasets from " + "'hvplot.sample_data' (deprecated; use 'hvplot.sampledata' instead)." 
+ ) + raise AttributeError(f"module 'hvplot.sample_data' has no attribute {name!r}") diff --git a/hvplot/tests/testdeprecations.py b/hvplot/tests/testdeprecations.py index 98e32afee..05e401d93 100644 --- a/hvplot/tests/testdeprecations.py +++ b/hvplot/tests/testdeprecations.py @@ -2,6 +2,8 @@ Tests for deprecation warnings. """ +import sys +import importlib import pandas as pd import pytest @@ -28,3 +30,9 @@ def test_converter_argument_hover_formatters(): df = pd.DataFrame({'x': [0, 1], 'y': [0, 1]}) with pytest.warns(DeprecationWarning): HoloViewsConverter(df, 'x', 'y', hover_formatters={'@{y}': 'printf'}) + + +def test_sample_data_deprecation(): + with pytest.warns(FutureWarning): + importlib.import_module('hvplot.sample_data') + sys.modules.pop('hvplot.sample_data', None)