get_ExoplanetEU_data.py 2.61 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from PyAstronomy import pyasl
import pandas as pd
import numpy as np

from sklearn.neighbors import KernelDensity
import astropy.units as u
from astropy import constants as const


def get_ExoplanetEU_data():
    """use PyAstronomy to access the data provided by exoplanet.eu
    https://pyastronomy.readthedocs.io/en/latest/pyaslDoc/resBasedDoc/exoplanetEU.html
    
    Return: dataframe, array for period, radius and distance
    """

    # Instantiate exoplanetEU2 object (download all the planets)
    v = pyasl.ExoplanetEU2()
    # Export all data as a pandas DataFrame
    df_planets = v.getAllDataPandas()

    mask_nan = ~(np.isnan(df_planets["orbital_period"]) | np.isnan(df_planets["radius"]))
    mask_P = df_planets["orbital_period"]<300
    df_planets_ = df_planets[mask_nan & mask_P] # dataframe only with planets which have P & R

    RJ_to_RE = (const.R_jup/const.R_earth).value
    period = df_planets_["orbital_period"].values
    radius = df_planets_["radius"].values*RJ_to_RE
    distance = df_planets_["semi_major_axis"].values
    
    return df_planets_, period, radius, distance
    
def gaussian_kernel_density_estimation(X_data, Y_data, baseX, baseY):
    """
    Input:
    @ X_data
    @ Y_data
    @ baseX
    @ baseY
    
    Return: X_arr and Y_arr for the grid, and the corresponding probability density grid, not in log-scale!
    
    Plot command: ax.contourf(X_grid, Y_grid, prob_density, cmap="binary", levels=15, alpha=0.5)
    """
    # transform actual data to log space before proceeding
    baseX_data = np.log(X_data)/np.log(baseX)
    baseY_data = np.log(Y_data)/np.log(baseY)

    # make a grid (in log space - to match the data)
    X = np.arange(baseX_data.min(),baseX_data.max()+1,1)
    Y = np.arange(baseY_data.min(),baseY_data.max()+1,1)
    # for now the density of the grid is fixed in here/ hardcoded!
    XX, YY = np.mgrid[X.min():X.max():200j, Y.min():Y.max():200j] 

    XY_sample = np.vstack([XX.ravel(), YY.ravel()]).T # now I have a grid of points covering my radii and periods
    XY_train  = np.vstack([baseX_data, baseY_data]).T # this is my data in grid-form

    # construct a Gaussian kernel density estimate of the distribution
    # for now the bandwidth is also hardcoded!
    kde = KernelDensity(bandwidth=0.25, kernel='gaussian')
    kde.fit(XY_train) # fit/load real dataset

    # score_samples() returns the log-likelihood of the samples
    prob_density = np.exp(kde.score_samples(XY_sample))
    # nm_sample is the range/grid over which I compute the density based on the data (2-D)
    prob_density = np.reshape(prob_density, XX.shape)
    
    return baseX**XX, baseY**YY, prob_density