When you work with data in Python, make sure you have it in CSV (Comma-Separated Values) form. This format is the most common way of storing data for ML. Other formats such as YAML, JSON, XML, SQL, and others can also be used, but they are less common.
- Read or load the data into Python.
- Get some information about your data.
- Remove or replace unwanted values (such as nulls) and columns.
- Print some interesting facts about your data.
- Plot the distribution of a column.
- Apply a filter and fetch the data that matches that filter.
import pandas as pd
import numpy as np
# Load the CSV file into a DataFrame bound to the variable `data`.
# If the file has no header row, pass header=None (pandas rejects
# header=False).
data = pd.read_csv('latest.csv')
# Preview a sample of the data (the first rows).
data.head()
# Print a summary of the frame: columns, non-null counts and dtypes.
data.info()
# Descriptive statistics of the data.
data.describe()
# Data type of every column.
data.dtypes
# Number of missing (null/NaN) values per column.
data.isna().sum()
# Number of duplicated rows.
data.duplicated().sum()
# Size of the data: how many samples (rows) it holds.
data.shape[0]
# Remove, in place, every row that contains a Null/NaN value in any column.
# This is a reasonable choice when the dataset is large and only a small
# share of it is missing.
data.dropna(inplace=True)
# List the columns to remove from the table.
columns_to_drop = ['column1', 'column2']
# Rebind `data` to a frame without the listed columns.
data = data.drop(labels=columns_to_drop, axis='columns')
# Count how often each distinct value of the 'markets' column occurs.
category_frequencies = data['markets'].value_counts()
# Count the values that occur exactly once.
values_with_frequency_one = category_frequencies.eq(1).sum()
print(f"Number of values with a frequency of one: {values_with_frequency_one}")
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the 20 most frequent values in the 'markets' column, with each
# bar labelled by its share of all rows. Assumes 'data' is your DataFrame.
plt.figure(figsize=(15, 6))  # Adjust the figure size as needed
# Count every market once and reuse the result for the report and the plot.
market_counts = data['markets'].value_counts()
# Get the top 30 markets by count
top_30_markets = market_counts.head(30)
print(f"The top 30 markets out of {market_counts.size} market: \n {top_30_markets}")
top_20_markets = market_counts.head(20)
# Create a barplot of the top 20 markets
plt.bar(top_20_markets.index, top_20_markets.values, color='cornflowerblue', width=0.8)
# Rotate x-axis labels for better readability
plt.xticks(rotation=90)
# Add labels and title
plt.xlabel('Market')
plt.ylabel('Count')
plt.title('Top 20 Markets by Count')
# Add percentage labels on top of each bar; the percentage is relative to the
# total number of rows, not only to the 20 plotted markets.
total_count = len(data['markets'])
for i, count in enumerate(top_20_markets.values):
    percentage = (count / total_count) * 100
    plt.annotate(f'{percentage:.2f}%', (i, count), ha='center', va='bottom')
# Save the figure before showing it, then display the plot
plt.tight_layout()
plt.savefig('Market_distribution.jpg')
plt.show()
# Make sure the date feature is in the right format before filtering on it:
# parse the 'date' column into pandas datetimes using the
# 'YYYY-MM-DD HH:MM:SS' layout.
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')
# Now apply the filter: keep the rows whose year is between 2010 and 2021,
# both ends included. Several independent conditions can be combined as
# (condition 1) & (condition 2) & ...
data = data[data['date'].dt.year.between(2010, 2021)]