from PyAstronomy import pyasl
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
import astropy.units as u
from astropy import constants as const


def get_ExoplanetEU_data():
    """Use PyAstronomy to access the data provided by exoplanet.eu.

    https://pyastronomy.readthedocs.io/en/latest/pyaslDoc/resBasedDoc/exoplanetEU.html

    Return:
        dataframe of planets with known period and radius (P < 300 d),
        plus arrays for period [days], radius [Earth radii] and
        semi-major axis [au]
    """
    # Instantiate the exoplanetEU2 object (downloads all the planets)
    v = pyasl.ExoplanetEU2()
    # Export all data as a pandas DataFrame
    df_planets = v.getAllDataPandas()

    # keep only planets that have both an orbital period and a radius,
    # and restrict to periods below 300 days
    mask_nan = ~(np.isnan(df_planets["orbital_period"]) | np.isnan(df_planets["radius"]))
    mask_P = df_planets["orbital_period"] < 300
    df_planets_ = df_planets[mask_nan & mask_P]

    # convert radii from Jupiter radii (exoplanet.eu convention) to Earth radii
    RJ_to_RE = (const.R_jup / const.R_earth).value
    period = df_planets_["orbital_period"].values
    radius = df_planets_["radius"].values * RJ_to_RE
    distance = df_planets_["semi_major_axis"].values
    return df_planets_, period, radius, distance


def gaussian_kernel_density_estimation(X_data, Y_data, baseX, baseY):
    """Estimate a 2-D probability density in log space with a Gaussian KDE.

    Input:
        X_data, Y_data : 1-D arrays with the data points (linear scale)
        baseX, baseY   : logarithm bases used to transform each axis

    Return:
        X_grid and Y_grid for the grid (in linear scale, not log scale) and the
        corresponding probability density evaluated on that grid.

    Plot command:
        ax.contourf(X_grid, Y_grid, prob_density, cmap="binary", levels=15, alpha=0.5)
    """
    # transform the actual data to log space before proceeding
    baseX_data = np.log(X_data) / np.log(baseX)
    baseY_data = np.log(Y_data) / np.log(baseY)

    # make a grid (in log space, to match the data)
    X = np.arange(baseX_data.min(), baseX_data.max() + 1, 1)
    Y = np.arange(baseY_data.min(), baseY_data.max() + 1, 1)
    # for now the grid resolution (200 x 200) is hardcoded!
    XX, YY = np.mgrid[X.min():X.max():200j, Y.min():Y.max():200j]
    # XY_sample is the grid of points covering the (log-transformed) X and Y ranges
    XY_sample = np.vstack([XX.ravel(), YY.ravel()]).T
    # XY_train is the data arranged as (n_samples, 2)
    XY_train = np.vstack([baseX_data, baseY_data]).T

    # construct a Gaussian kernel density estimate of the distribution;
    # for now the bandwidth is also hardcoded!
    kde = KernelDensity(bandwidth=0.25, kernel='gaussian')
    kde.fit(XY_train)  # fit the KDE on the real dataset

    # score_samples() returns the log-likelihood of the samples,
    # evaluated here on XY_sample, the grid covering the data range (2-D)
    prob_density = np.exp(kde.score_samples(XY_sample))
    prob_density = np.reshape(prob_density, XX.shape)
    return baseX**XX, baseY**YY, prob_density
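

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how the two functions above might be combined, following
# the contourf call suggested in the docstring. It assumes matplotlib is
# installed; the base-10 log axes, marker styling and axis labels are
# assumptions for illustration only.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    # download the exoplanet.eu sample and compute the KDE in log10-log10 space
    df_planets_, period, radius, distance = get_ExoplanetEU_data()
    X_grid, Y_grid, prob_density = gaussian_kernel_density_estimation(
        period, radius, baseX=10, baseY=10
    )

    # shade the estimated density and overplot the individual planets
    fig, ax = plt.subplots()
    ax.contourf(X_grid, Y_grid, prob_density, cmap="binary", levels=15, alpha=0.5)
    ax.scatter(period, radius, s=5, color="k")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("Orbital period [days]")
    ax.set_ylabel("Planet radius [Earth radii]")
    plt.show()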