Building Your Data Science Foundation
ISM6251 | Week 2
Python • NumPy • Pandas • Matplotlib
By the end of this week, you will be able to:
# Python is dynamically typed: a name takes the type of the value bound to it
age = 25            # int
price = 19.99       # float
name = "Alice"      # str
is_student = True   # bool

# type() reports the runtime type of a value
print(type(age))    # <class 'int'>
print(type(price))  # <class 'float'>
print(type(name))   # <class 'str'>

# Type conversion through the built-in constructors
str_num = "42"
int_num = int(str_num)      # 42
float_num = float(str_num)  # 42.0

# Unpacking binds several names in one statement
x, y, z = 1, 2, 3
# Chained assignment binds every name to the same value
a = b = c = 0
# Arithmetic operators
a, b = 10, 3
print(a + b)   # 13
print(a - b)   # 7
print(a * b)   # 30
print(a / b)   # 3.333... (true division)
print(a // b)  # 3 (floor division)
print(a % b)   # 1 (remainder)
print(a ** b)  # 1000 (power)

# Comparison operators evaluate to a bool
x, y = 5, 10
print(x < y)   # True
print(x <= y)  # True
print(x > y)   # False
print(x >= y)  # False
print(x == y)  # False
print(x != y)  # True
# Strings can be written with double or single quotes
greeting = "Hello"
name = 'World'
# + concatenates strings
message = greeting + " " + name
print(message)  # Hello World

# f-strings substitute expressions written inside {}
age = 25
height = 5.9
info = f"{name} is {age} years old"
print(info)  # World is 25 years old

# String methods return new strings; the original is left unchanged
text = " Python Programming "
print(text.strip())  # leading/trailing whitespace removed
print(text.upper())  # PYTHON PROGRAMMING (surrounding spaces kept)
print(text.lower())  # python programming (surrounding spaces kept)
print(text.replace("Python", "Java"))

# Slicing uses [start:stop:step]; negative indices count from the end
word = "Python"
print(word[0])     # P
print(word[-1])    # n
print(word[0:3])   # Pyt
print(word[::2])   # Pto
print(word[::-1])  # nohtyP (reverse)
# If-elif-else: conditions are tested top to bottom; the first true one wins
score = 85
if score >= 90:
    grade = 'A'
elif score >= 80:
    grade = 'B'
elif score >= 70:
    grade = 'C'
else:
    grade = 'F'
print(f"Grade: {grade}")  # Grade: B

# For loop over the items of a list
fruits = ['apple', 'banana', 'orange']
for fruit in fruits:
    print(f"I like {fruit}")

# For with range: range(5) yields 0, 1, 2, 3, 4
for i in range(5):
    print(i, end=' ')  # 0 1 2 3 4
print()  # finish the line that end=' ' left open

# While loop: repeats as long as the condition holds
count = 0
while count < 3:
    print(f"Count: {count}")
    count += 1

# Break and continue
for i in range(10):
    if i == 7:
        break      # leave the loop entirely
    if i % 2 == 0:
        continue   # skip even numbers, go to the next iteration
    print(i, end=' ')
print()  # finish the line that end=' ' left open
# Output: 1 3 5
# Basic function
def greet(name):
    """Return the greeting ``"Hello, <name>!"`` for the given name."""
    return f"Hello, {name}!"


print(greet("Alice"))  # Hello, Alice!
# Default parameters
def power(base, exponent=2):
    """Return ``base`` raised to ``exponent`` (squares by default)."""
    return base ** exponent


print(power(3))     # 9 (3^2)
print(power(3, 3))  # 27 (3^3)
# Multiple return values
def calculate_stats(numbers):
    """Return ``(minimum, maximum, mean)`` for a non-empty sequence of numbers.

    The three values are returned as one tuple, which the caller can
    unpack into separate names.

    Raises:
        ValueError: if ``numbers`` is empty (raised by ``min``).
    """
    return (
        min(numbers),
        max(numbers),
        sum(numbers) / len(numbers),
    )


min_val, max_val, avg = calculate_stats([1, 2, 3, 4, 5])
print(f"Min: {min_val}, Max: {max_val}, Avg: {avg}")
# Lambda functions: small anonymous functions written as one expression
square = lambda x: x**2
numbers = [1, 2, 3, 4, 5]
# map applies the lambda to every element
squared = list(map(lambda x: x**2, numbers))
print(squared)  # [1, 4, 9, 16, 25]

# filter keeps only the elements for which the lambda is true
evens = list(filter(lambda x: x % 2 == 0, numbers))
print(evens)  # [2, 4]
# *args: Variable positional arguments
def print_scores(*args):
    """Print each score with its 1-based position, then the average.

    Args:
        *args: Any number of numeric scores, collected into a tuple.

    Prints "No scores provided" and returns early when called with no
    arguments, which also avoids dividing by zero for the average.
    """
    if not args:
        print("No scores provided")
        return
    for i, score in enumerate(args, 1):
        print(f"Score {i}: {score}")
    print(f"Average: {sum(args)/len(args):.1f}")


print_scores(85, 90, 78, 92, 88)
# Score 1: 85
# Score 2: 90...
# Average: 86.6
# **kwargs: Variable keyword arguments
def create_profile(**kwargs):
    """Create a profile dictionary from keyword arguments.

    Args:
        **kwargs: Arbitrary ``key=value`` pairs, collected into a dict.

    Returns:
        A new dict holding every key/value pair that was passed.
    """
    # dict() makes a shallow copy, the same result as copying key by key
    return dict(kwargs)


user = create_profile(
    name="Alice", age=25,
    city="NYC", role="Developer"
)
# Combining all types
def process(op, *values, **options):
    """Print the operation name, the extra positional values, and the options.

    Args:
        op: Required first positional argument.
        *values: Any further positional arguments, collected into a tuple.
        **options: Any keyword arguments, collected into a dict.
    """
    print(f"Operation: {op}")
    print(f"Values: {values}")
    print(f"Options: {options}")


process("sum", 1, 2, 3, verbose=True)
# ml_utils.py
"""Utility functions for ML tasks"""


def normalize(data):
    """Min-max normalize ``data`` so its values fall in the range [0, 1].

    Args:
        data: Sequence of numbers.

    Returns:
        A list in which each value ``x`` is rescaled to
        ``(x - min) / (max - min)``. An empty input gives ``[]``. When
        every value is identical the range is zero, so each element maps
        to ``0.0`` instead of dividing by zero.
    """
    # len() rather than truthiness so array-like inputs are accepted too
    if len(data) == 0:
        return []
    min_val = min(data)
    span = max(data) - min_val
    if span == 0:
        return [0.0] * len(data)
    return [(x - min_val) / span for x in data]
def train_test_split(data, test_size=0.2):
    """Split ``data`` into a leading train part and a trailing test part.

    The data is not shuffled: the first items become the train set and
    the remaining items the test set.

    Args:
        data: Sliceable sequence (e.g. a list).
        test_size: Fraction of the items to place in the test set,
            between 0 and 1 inclusive.

    Returns:
        A ``(train, test)`` tuple of slices of ``data``.

    Raises:
        ValueError: if ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")
    # Round away binary floating-point noise before truncating:
    # 10 * (1 - 0.9) evaluates to 0.9999999999999998, which int() alone
    # would floor to 0 instead of 1.
    split_idx = int(round(len(data) * (1 - test_size), 9))
    return data[:split_idx], data[split_idx:]
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Sequence of true labels.
        y_pred: Sequence of predicted labels, the same length as ``y_true``.

    Returns:
        Number of positions where the two sequences agree, divided by
        ``len(y_true)``.

    Raises:
        ValueError: if the two sequences differ in length (``zip`` would
            otherwise silently ignore the extra items).
        ZeroDivisionError: if the sequences are empty.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )
    correct = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    return correct / len(y_true)
# In your notebook or script: import the whole module ...
import ml_utils

# ... and call its functions through the module name
data = [10, 20, 30, 40, 50]
normalized = ml_utils.normalize(data)
print(normalized)

# Or import specific functions and call them directly
from ml_utils import train_test_split

train, test = train_test_split(data)
print(f"Train: {train}")
print(f"Test: {test}")

# Import with alias: a shorter name for the same module
import ml_utils as utils

result = utils.accuracy([1, 0, 1, 1], [1, 0, 0, 1])
print(f"Accuracy: {result:.2%}")
Best practice: organize reusable functions in .py modules to keep your code clean and maintainable.
# Creating lists: ordered, mutable, and free to mix element types
numbers = [1, 2, 3, 4, 5]
mixed = [1, "hello", 3.14, True]
nested = [[1, 2], [3, 4], [5, 6]]

# Growing a list
fruits = ['apple', 'banana']
fruits.append('orange')    # add at the end
fruits.insert(1, 'grape')  # insert before index 1
print(fruits)
# ['apple', 'grape', 'banana', 'orange']

# Shrinking a list
fruits.remove('grape')  # remove the first matching value
last = fruits.pop()     # remove and return the last item ('orange')
print(fruits)           # ['apple', 'banana']

# List slicing: [start:stop:step]
nums = [0, 1, 2, 3, 4, 5]
print(nums[2:5])   # [2, 3, 4]
print(nums[::2])   # [0, 2, 4] (every 2nd)
print(nums[::-1])  # [5, 4, 3, 2, 1, 0]

# sort() reorders the list in place
scores = [85, 92, 78, 95, 88]
scores.sort()
print(scores)  # [78, 85, 88, 92, 95]
print(f"Max: {max(scores)}, Min: {min(scores)}")
print(f"Sum: {sum(scores)}, Avg: {sum(scores)/len(scores)}")
# Creating tuples: ordered like lists, but immutable
point = (3, 4)
rgb = (255, 128, 0)
single = (42,)  # the trailing comma is what makes a one-item tuple

# Tuple unpacking: one name per element
x, y = point
print(f"x: {x}, y: {y}")  # x: 3, y: 4
# Multiple assignment
def get_min_max(numbers):
    """Return ``(minimum, maximum)`` of a non-empty sequence as a tuple."""
    return min(numbers), max(numbers)


minimum, maximum = get_min_max([1, 2, 3, 4, 5])
print(f"Min: {minimum}, Max: {maximum}")
# Tuples are immutable
# point[0] = 5  # This would raise TypeError

# Named tuples: tuple elements that can also be read by field name
from collections import namedtuple

Person = namedtuple('Person', ['name', 'age', 'city'])
alice = Person('Alice', 30, 'NYC')
print(alice.name)  # Alice
print(alice.age)   # 30

# Tuples are hashable, so they can be dictionary keys (lists cannot)
locations = {
    (40.7128, -74.0060): "New York",
    (51.5074, -0.1278): "London",
}
# Creating dictionaries: key -> value mappings
person = {
    'name': 'Alice',
    'age': 30,
    'city': 'NYC',
}
# Alternative creation: keyword arguments become string keys
scores = dict(math=90, english=85, science=92)

# Accessing values
print(person['name'])                # Alice
print(person.get('age'))             # 30
print(person.get('job', 'Unknown'))  # Unknown (default for a missing key)

# Adding/updating: assigning to a key inserts or overwrites it
person['job'] = 'Developer'
person['age'] = 31

# Dictionary methods
print(person.keys())    # dict_keys(['name', 'age', 'city', 'job'])
print(person.values())  # dict_values(['Alice', 31, 'NYC', 'Developer'])

# Iterating over key/value pairs
for key, value in person.items():
    print(f"{key}: {value}")

# Nested dictionaries: index one level at a time
data = {
    'user1': {'name': 'Alice', 'age': 30},
    'user2': {'name': 'Bob', 'age': 25},
}
print(data['user1']['name'])  # Alice
# Immutable example: rebinding x does not affect y
x = 5
y = x
x = 10
print(f"x: {x}, y: {y}")  # x: 10, y: 5

# Strings are immutable too: upper() returns a new string
s1 = "hello"
s2 = s1
s1 = s1.upper()
print(f"s1: {s1}, s2: {s2}")
# s1: HELLO, s2: hello

# Mutable example - aliasing
list1 = [1, 2, 3]
list2 = list1  # no copy is made: both names refer to the same list
list1.append(4)
print(f"list1: {list1}")  # [1, 2, 3, 4]
print(f"list2: {list2}")  # [1, 2, 3, 4] - also changed!

# Avoid aliasing by copying the list first
list3 = [1, 2, 3]
list4 = list3.copy()  # or list3[:]
list3.append(4)
print(f"list3: {list3}")  # [1, 2, 3, 4]
print(f"list4: {list4}")  # [1, 2, 3] - unchanged
# Function arguments
def modify_list(lst):
    """Append 100 to ``lst`` in place.

    The caller's list object itself is changed, because the function
    receives a reference to it rather than a copy. Returns ``None``.
    """
    lst.append(100)  # Modifies original!


my_list = [1, 2, 3]
modify_list(my_list)
print(my_list)  # [1, 2, 3, 100]
# List of dictionaries (like database records): one dict per student
students = [
    {'name': 'Alice', 'age': 20, 'grade': 85},
    {'name': 'Bob', 'age': 21, 'grade': 92},
    {'name': 'Charlie', 'age': 19, 'grade': 78},
]

# Access and modify: list index first, then dict key
print(students[0]['name'])  # Alice
students[1]['grade'] = 95

# Add new student
students.append({'name': 'Diana', 'age': 22, 'grade': 88})

# Find students with grade > 80
high_performers = [record for record in students if record['grade'] > 80]
print(f"High performers: {len(high_performers)}")

# Average grade
avg_grade = sum(record['grade'] for record in students) / len(students)
print(f"Average grade: {avg_grade:.1f}")
# Dictionary of lists (grouped data): course code -> enrolled students
courses = {
    'Math101': ['Alice', 'Bob', 'Charlie'],
    'CS201': ['Bob', 'Diana'],
    'Eng301': ['Alice', 'Charlie', 'Eve'],
}

# Add student to course
courses['Math101'].append('Frank')

# Find all courses for a student: keep each course whose roster has the name
student = 'Alice'
alice_courses = [
    course
    for course, roster in courses.items()
    if student in roster
]
print(f"{student}'s courses: {alice_courses}")
# Alice's courses: ['Math101', 'Eng301']
# Complex nested structure: department -> list of employee records
company = {
    'Engineering': [
        {'name': 'Alice', 'salary': 120000},
        {'name': 'Bob', 'salary': 105000},
    ],
    'Sales': [
        {'name': 'Charlie', 'salary': 85000},
        {'name': 'Diana', 'salary': 92000},
    ],
}

# Calculate department averages
for dept, employees in company.items():
    avg_salary = sum(e['salary'] for e in employees) / len(employees)
    print(f"{dept}: ${avg_salary:,.0f}")
# Engineering: $112,500
# Sales: $88,500
# Regular function for complex processing
def process_student_data(students):
    """Calculate statistics for student records.

    Args:
        students: List of dicts, each with a numeric ``'grade'`` entry.

    Returns:
        A dict with the keys ``'count'``, ``'average'``, ``'highest'``,
        ``'lowest'`` and ``'passing'`` (number of grades >= 70), or an
        empty dict when ``students`` is empty.
    """
    if not students:
        return {}
    grades = [s['grade'] for s in students]
    return {
        'count': len(students),
        'average': sum(grades) / len(grades),
        'highest': max(grades),
        'lowest': min(grades),
        # count directly instead of building a list just to measure it
        'passing': sum(1 for g in grades if g >= 70),
    }
students = [
    {'name': 'Alice', 'grade': 85},
    {'name': 'Bob', 'grade': 92},
    {'name': 'Charlie', 'grade': 78},
    {'name': 'Diana', 'grade': 65},
]
stats = process_student_data(students)
print(f"Class average: {stats['average']:.1f}")
print(f"Passing students: {stats['passing']}/{stats['count']}")

# Lambda function for sorting complex structures:
# key= extracts the value to sort by, reverse=True puts the highest first
students_sorted = sorted(
    students,
    key=lambda s: s['grade'],
    reverse=True,
)
print(f"Top student: {students_sorted[0]['name']}")

# Using map with lambda on nested data
student_names = list(map(lambda s: s['name'].upper(), students))
print(student_names)  # ['ALICE', 'BOB', 'CHARLIE', 'DIANA']
# Basic list comprehension: [expression for item in iterable]
squares = [x**2 for x in range(10)]
print(squares)
# [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

# With condition: a trailing "if" filters the items
evens = [x for x in range(20) if x % 2 == 0]
print(evens)
# [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

# Multiple conditions combined with "and"
filtered = [
    x
    for x in range(100)
    if x % 2 == 0 and x % 3 == 0
]

# String operations
words = ['hello', 'world', 'python']
upper = [w.upper() for w in words]
# ['HELLO', 'WORLD', 'PYTHON']

# Nested comprehension: the inner list is built once per outer i
matrix = [
    [i + j for j in range(3)]
    for i in range(3)
]
# [[0,1,2], [1,2,3], [2,3,4]]

# Flatten nested list: the for-clauses read left to right, outer first
nested = [[1, 2], [3, 4], [5, 6]]
flat = [x for sublist in nested for x in sublist]
# [1, 2, 3, 4, 5, 6]
# Regular function in comprehension
def is_valid_score(score):
    """Return True when ``score`` lies in the range 0-100 inclusive."""
    return 0 <= score <= 100


scores = [85, 92, -5, 78, 105, 88, 95]
valid_scores = [s for s in scores if is_valid_score(s)]
print(valid_scores)  # [85, 92, 78, 88, 95]
# Lambda in comprehension: define the lambda and call it immediately
numbers = [1, 2, 3, 4, 5]
transformed = [(lambda x: x**2 + 2*x + 1)(n) for n in numbers]
print(transformed)  # [4, 9, 16, 25, 36]

# Equivalent using map with lambda
squared_plus = list(map(lambda x: x**2 + 2*x + 1, numbers))
print(squared_plus)  # [4, 9, 16, 25, 36]
# Complex example: process student records
students = [
    {'name': 'Alice', 'scores': [85, 90, 88]},
    {'name': 'Bob', 'scores': [78, 82, 85]},
    {'name': 'Charlie', 'scores': [92, 95, 90]},
]


# Calculate averages using regular function
def calculate_avg(scores):
    """Return the arithmetic mean of a non-empty list of scores."""
    return sum(scores) / len(scores)


averages = [
    {'name': s['name'], 'avg': calculate_avg(s['scores'])}
    for s in students
]

# Same with lambda (less readable for complex logic)
averages_lambda = [
    {'name': s['name'],
     'avg': (lambda scores: sum(scores)/len(scores))(s['scores'])}
    for s in students
]
# Basic dictionary comprehension: {key: value for item in iterable}
squares_dict = {x: x**2 for x in range(5)}
print(squares_dict)
# {0: 0, 1: 1, 2: 4, 3: 9, 4: 16}

# From two lists: zip pairs the items up position by position
names = ['Alice', 'Bob', 'Charlie']
ages = [25, 30, 35]
people = {name: age for name, age in zip(names, ages)}
# {'Alice': 25, 'Bob': 30, 'Charlie': 35}

# With condition: keep only the entries that pass the test
adults = {name: age for name, age in people.items() if age >= 30}
# {'Bob': 30, 'Charlie': 35}

# Invert dictionary: swap keys and values
inverted = {v: k for k, v in people.items()}
# {25: 'Alice', 30: 'Bob', 35: 'Charlie'}

# Transform values while keeping the keys
ages_in_months = {name: age * 12 for name, age in people.items()}
# Regular function for value processing
def calculate_tax(salary):
    """Calculate tax based on salary.

    A single flat rate, chosen by bracket, is applied to the whole salary:
    10% below 50,000, 20% from 50,000 up to (not including) 100,000,
    and 30% from 100,000 upwards.
    """
    if salary < 50000:
        return salary * 0.10
    if salary < 100000:
        return salary * 0.20
    return salary * 0.30
employees = {
    'Alice': 45000,
    'Bob': 75000,
    'Charlie': 120000,
    'Diana': 55000,
}

# Dictionary comprehension with regular function: name -> tax owed
taxes = {name: calculate_tax(salary) for name, salary in employees.items()}
print(taxes)
# {'Alice': 4500.0, 'Bob': 15000.0,
# 'Charlie': 36000.0, 'Diana': 11000.0}

# Lambda for simple transformations
# Convert to after-tax income (flat 20% applied through an inline lambda)
after_tax = {
    name: salary - (lambda s: s * 0.2)(salary)
    for name, salary in employees.items()
}
# Process with multiple functions
def get_level(salary):
    """Return 'Senior' for a salary above 80,000, otherwise 'Junior'."""
    return 'Senior' if salary > 80000 else 'Junior'
# Build one nested record per employee, combining several functions
employee_info = {
    name: {
        'salary': salary,
        'tax': calculate_tax(salary),
        'level': get_level(salary),
        'monthly': salary / 12,
    }
    for name, salary in employees.items()
}

# Filter using lambda: the lambda is called inline as the condition
high_earners = {
    name: salary
    for name, salary in employees.items()
    if (lambda s: s > 70000)(salary)
}
import numpy as np
# From list
arr1 = np.array([1, 2, 3, 4, 5])
print(arr1)  # [1 2 3 4 5]

# 2D array (matrix) from a list of rows
matrix = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(matrix.shape)  # (3, 3)

# Initialize arrays of a given shape
zeros = np.zeros((3, 4))   # 3x4 matrix of zeros
ones = np.ones((2, 3))     # 2x3 matrix of ones
full = np.full((3, 3), 7)  # 3x3 filled with 7
identity = np.eye(4)       # 4x4 identity matrix

# Sequences
range_arr = np.arange(0, 10, 2)  # [0 2 4 6 8] (stop is exclusive)
linear = np.linspace(0, 1, 5)    # [0. 0.25 0.5 0.75 1.] (stop is included)

# Random arrays
random_uniform = np.random.rand(3, 3)   # Uniform [0,1)
random_normal = np.random.randn(3, 3)   # Normal(0,1)
random_int = np.random.randint(0, 10, size=(3, 3))

# Array properties
print(f"Shape: {matrix.shape}")
print(f"Data type: {matrix.dtype}")
print(f"Size: {matrix.size}")
print(f"Dimensions: {matrix.ndim}")
# 1D array indexing works like list indexing
arr = np.array([10, 20, 30, 40, 50])
print(arr[0])    # 10
print(arr[-1])   # 50
print(arr[1:4])  # [20 30 40]

# 2D array indexing: [row, column]
matrix = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(matrix[0, 0])      # 1 (first element)
print(matrix[1, :])      # [4 5 6] (second row)
print(matrix[:, 2])      # [3 6 9] (third column)
print(matrix[0:2, 1:3])  # [[2 3], [5 6]]

# Boolean indexing: a comparison gives a mask that selects elements
arr = np.array([1, 2, 3, 4, 5])
mask = arr > 3
print(mask)       # [False False False True True]
print(arr[mask])  # [4 5]

# Direct boolean indexing
matrix = np.random.randint(0, 10, (4, 4))
print(matrix[matrix > 5])  # All elements > 5

# Fancy indexing: select by a list of positions
arr = np.array([10, 20, 30, 40, 50])
indices = [0, 2, 4]
print(arr[indices])  # [10 30 50]

# Modify using indexing
matrix[0, 0] = 100
matrix[matrix < 5] = 0  # Set all values < 5 to 0
# Element-wise operations: applied position by position
a = np.array([1, 2, 3, 4])
b = np.array([10, 20, 30, 40])
print(a + b)   # [11 22 33 44]
print(a * b)   # [10 40 90 160]
print(b / a)   # [10. 10. 10. 10.]
print(a ** 2)  # [1 4 9 16]

# Broadcasting with scalar
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(arr * 2)  # All elements multiplied by 2

# Broadcasting with different shapes
row = np.array([1, 2, 3])     # Shape: (3,)
col = np.array([[10], [20]])  # Shape: (2, 1)
result = row + col            # Broadcasting! Result shape: (2, 3)
# [[11 12 13],
# [21 22 23]]

# Aggregations: axis=0 collapses rows, axis=1 collapses columns
matrix = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(matrix.sum())        # 45 (all elements)
print(matrix.sum(axis=0))  # [12 15 18] (column sums)
print(matrix.sum(axis=1))  # [6 15 24] (row sums)
print(matrix.mean())       # 5.0
print(matrix.std())        # 2.58...

# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(np.dot(A, B))  # Matrix multiplication
print(A.T)           # Transpose
# Generate sample data: 1000 standard-normal draws scaled by 15, shifted by 100
np.random.seed(42)  # fixed seed so the numbers repeat from run to run
data = np.random.randn(1000) * 15 + 100

# Basic statistics
print(f"Mean: {data.mean():.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Std Dev: {data.std():.2f}")
print(f"Variance: {data.var():.2f}")

# Min/Max and their indices
print(f"Min: {data.min():.2f}")
print(f"Max: {data.max():.2f}")
print(f"Index of min: {data.argmin()}")
print(f"Index of max: {data.argmax()}")

# Percentiles
print(f"25th percentile: {np.percentile(data, 25):.2f}")
print(f"75th percentile: {np.percentile(data, 75):.2f}")

# 2D array statistics
matrix = np.random.randint(0, 100, (4, 5))
print(matrix)

# Statistics along axes: axis=0 gives one value per column, axis=1 per row
print(f"Column means: {matrix.mean(axis=0)}")
print(f"Row means: {matrix.mean(axis=1)}")
print(f"Column max: {matrix.max(axis=0)}")

# Correlation: y is 2*x plus noise; [0, 1] picks the x-y entry of the matrix
x = np.random.randn(100)
y = 2 * x + np.random.randn(100) * 0.5
correlation = np.corrcoef(x, y)[0, 1]
print(f"Correlation: {correlation:.3f}")
import time
# Python list operation
def python_sum_squares(n):
    """Return the sum of the squares of 0..n-1 using a plain Python list.

    This is the pure-Python side of the timing comparison against
    ``numpy_sum_squares``: it builds the list of squares one element at
    a time in an explicit loop.
    """
    numbers = list(range(n))
    result = []
    for num in numbers:
        result.append(num ** 2)
    return sum(result)
# NumPy operation
def numpy_sum_squares(n):
    """Return the sum of the squares of 0..n-1 using one vectorized expression.

    The array is created as 64-bit integers explicitly. With a 32-bit
    default integer type the squares would wrap around silently once the
    values exceed 46340, giving a wrong total for n = 1,000,000.
    """
    numbers = np.arange(n, dtype=np.int64)
    return (numbers ** 2).sum()
# Timing comparison
# perf_counter() is a high-resolution clock meant for measuring short
# durations; time.time() can be too coarse and report 0.0 for the fast
# NumPy run, which would make the ratio below divide by zero.
n = 1000000

start = time.perf_counter()
python_result = python_sum_squares(n)
python_time = time.perf_counter() - start

start = time.perf_counter()
numpy_result = numpy_sum_squares(n)
numpy_time = time.perf_counter() - start

print(f"Python time: {python_time:.4f} seconds")
print(f"NumPy time: {numpy_time:.4f} seconds")
print(f"NumPy is {python_time/numpy_time:.1f}x faster")
# Output (typical):
# Python time: 0.1234 seconds
# NumPy time: 0.0023 seconds
# NumPy is 53.7x faster
# Memory efficiency
import sys

py_list = list(range(1000))
np_array = np.arange(1000)
# NOTE: sys.getsizeof is shallow - it measures the list object itself and
# does not include the int objects the list refers to.
print(f"Python list size: {sys.getsizeof(py_list)} bytes")
# nbytes is the size of the array's element data
print(f"NumPy array size: {np_array.nbytes} bytes")
import pandas as pd
# From dictionary: each key becomes a column, each list that column's values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 35, 28],
    'City': ['NYC', 'Paris', 'London', 'Tokyo'],
    'Salary': [70000, 80000, 75000, 90000],
}
df = pd.DataFrame(data)
print(df)
# Output:
#       Name  Age    City  Salary
# 0    Alice   25     NYC   70000
# 1      Bob   30   Paris   80000
# 2  Charlie   35  London   75000
# 3    Diana   28   Tokyo   90000

# DataFrame info
print(df.shape)  # (rows, columns)
# Output: (4, 4)
print(df.columns.tolist())
# Output: ['Name', 'Age', 'City', 'Salary']
print(df.dtypes)
# Output:
# Name      object
# Age        int64
# City      object
# Salary     int64
# Column selection: one label gives a Series
print(df['Name'])
# Output: Series with names
# 0      Alice
# 1        Bob
# 2    Charlie
# 3      Diana

# A list of labels gives a DataFrame
print(df[['Name', 'Salary']])
# Output: DataFrame with 2 columns

# Row selection: iloc selects by integer position
print(df.iloc[0])
# Output: First row as Series
# Name      Alice
# Age          25
# City        NYC
# Salary    70000

# loc selects by label, and a label slice includes its end point
print(df.loc[0:2])
# Output: DataFrame with rows 0-2

# Conditional selection: a boolean Series picks the matching rows
print(df[df['Age'] > 30])
# Output: Rows where Age > 30

# Combine conditions with & and wrap each one in parentheses
print(df[(df['Age'] > 25) & (df['Salary'] > 75000)])
# Output: Multiple conditions

# Using query
print(df.query('Age > 30 and Salary < 80000'))
# Output: String-based filtering
# Add new column: computed element-wise from an existing one
df['Bonus'] = df['Salary'] * 0.1
print(df['Bonus'].head(2))
# Output:
# 0    7000.0
# 1    8000.0

# Modify existing column (5% raise)
df['Salary'] = df['Salary'] * 1.05

# Sort data
df_sorted = df.sort_values('Salary', ascending=False)
print(df_sorted[['Name', 'Salary']].head(2))
# Output:
#     Name    Salary
# 3  Diana  94500.00
# 1    Bob  84000.00

# Group by operations
df_grouped = df.groupby('City')['Salary'].mean()
print(df_grouped)
# Output:
# City
# London    78750.0
# NYC       73500.0
# Paris     84000.0
# Tokyo     94500.0

# Handling missing data: each call returns a new DataFrame, df is unchanged
df_clean = df.dropna()     # Drop NaN rows
df_filled = df.fillna(0)   # Fill with 0
# Forward fill. fillna(method='ffill') is deprecated in pandas in favour
# of ffill(); the result is kept in a name so it is not thrown away.
df_ffilled = df.ffill()
# Basic statistics: describe() summarizes the numeric columns
print(df.describe())
# Output (for the four-row DataFrame as first created, i.e. without the
# Bonus column and before the 5% salary change):
#            Age        Salary
# count   4.0000      4.000000
# mean   29.5000  78750.000000
# std     4.2032   8539.125638
# min    25.0000  70000.000000
# 25%    27.2500  73750.000000
# 50%    29.0000  77500.000000
# 75%    31.2500  82500.000000
# max    35.0000  90000.000000

# Single column stats
mean_salary = df['Salary'].mean()
print(f"Mean: {mean_salary:.2f}")
# Output (for the original salaries): Mean: 78750.00

# Group by aggregation: a dict maps each column to the statistic(s) wanted
agg_spec = {
    'Salary': ['mean', 'max', 'min'],
    'Age': 'mean',
}
grouped = df.groupby('City').agg(agg_spec)
print(grouped)
# Output: Multi-level column DataFrame

# Value counts: how many rows hold each distinct value
print(df['City'].value_counts())
# Output:
# NYC       1
# Paris     1
# London    1
# Tokyo     1
import matplotlib.pyplot as plt
import numpy as np
# Generate data: y depends linearly on x, plus random noise
np.random.seed(42)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100) * 0.5

# Create scatter plot; c=x colors each point by its x value
plt.figure(figsize=(10, 6))
plt.scatter(x, y, alpha=0.6, c=x, cmap='viridis', s=50)

# Labels and title
plt.xlabel('X Variable')
plt.ylabel('Y Variable')
plt.title('Scatter Plot Example')

# Add grid
plt.grid(True, alpha=0.3)

# Add colorbar explaining the point colors
plt.colorbar(label='X Value')

# Show plot
plt.show()

# Correlation coefficient: off-diagonal entry of the 2x2 correlation matrix
corr = np.corrcoef(x, y)[0, 1]
print(f"Correlation: {corr:.3f}")
# Output: Correlation: 0.964
# Generate data: 1000 draws from a normal distribution (mean 100, std 15)
data = np.random.normal(100, 15, 1000)

# Create histogram; hist returns the bin counts, bin edges and drawn patches
plt.figure(figsize=(10, 6))
counts, bins, patches = plt.hist(
    data,
    bins=30,
    edgecolor='black',
    alpha=0.7,
    color='steelblue',
)

# Add vertical line at mean
mean_val = data.mean()
plt.axvline(
    mean_val,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: {mean_val:.1f}',
)

# Labels and title
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')

# Add legend and grid (horizontal grid lines only)
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
plt.show()

# Print statistics
print(f"Mean: {data.mean():.2f}")
print(f"Std Dev: {data.std():.2f}")
print(f"Min: {data.min():.2f}")
print(f"Max: {data.max():.2f}")
# Output:
# Mean: 100.12
# Std Dev: 14.87
# Min: 52.84
# Max: 143.56
# Create data
categories = ['Product A', 'Product B',
              'Product C', 'Product D']
values = [23, 45, 56, 78]

# Vertical bar chart
plt.figure(figsize=(10, 6))
bars = plt.bar(categories, values,
               color='coral',
               edgecolor='black',
               alpha=0.7)

# Add value labels on bars: centered horizontally, just above each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.,
             height, f'{height}',
             ha='center', va='bottom')

# Labels and title
plt.xlabel('Products')
plt.ylabel('Sales (in thousands)')
plt.title('Bar Chart Example')

# Add grid (horizontal grid lines only)
plt.grid(True, alpha=0.3, axis='y')
plt.show()

# Calculate total and average
total = sum(values)
avg = total / len(values)
print(f"Total Sales: {total}k")
print(f"Average: {avg:.1f}k")
# Output:
# Total Sales: 202k
# Average: 50.5k
# 2x2 grid of plots; axes[row, col] addresses one panel
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Scatter plot (top-left)
scatter_ax = axes[0, 0]
scatter_ax.scatter(x, y, alpha=0.6)
scatter_ax.set_title('Scatter Plot')
scatter_ax.set_xlabel('X')
scatter_ax.set_ylabel('Y')

# Histogram (top-right)
hist_ax = axes[0, 1]
hist_ax.hist(data, bins=20, color='green', alpha=0.7)
hist_ax.set_title('Histogram')
hist_ax.set_xlabel('Value')
hist_ax.set_ylabel('Frequency')

# Bar chart (bottom-left)
bar_ax = axes[1, 0]
bar_ax.bar(categories, values, color='orange')
bar_ax.set_title('Bar Chart')
bar_ax.set_xlabel('Category')

# Line plot (bottom-right): one sine curve over [0, 10]
line_ax = axes[1, 1]
x_line = np.linspace(0, 10, 100)
y_line = np.sin(x_line)
line_ax.plot(x_line, y_line, 'r-')
line_ax.set_title('Line Plot')

# Overall title
plt.suptitle('Dashboard', fontsize=14)
plt.tight_layout()
plt.show()
Complete the Week 2 Jupyter Notebooks:
Essential Skills for Machine Learning:
Remember: These tools form the foundation for all machine learning work in Python!
Week 3 Preview:
Keep practicing with the notebooks!