머신러닝 프로젝트

from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it if needed.

    The archive is downloaded to ``datasets/housing.tgz`` only when that file
    is missing. It is unpacked whenever it was just downloaded *or* the
    extracted ``datasets/housing/housing.csv`` is not present, so an archive
    that already exists but was never (or only partially) extracted no longer
    leads to a missing-file error from ``read_csv``.

    Returns:
        pandas.DataFrame: the parsed contents of
        ``datasets/housing/housing.csv``.
    """
    datasets_dir = Path('datasets')
    tarball_path = datasets_dir / 'housing.tgz'
    csv_path = datasets_dir / 'housing' / 'housing.csv'

    freshly_downloaded = False
    if not tarball_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        url = 'https://github.com/ageron/data/raw/main/housing.tgz'
        urllib.request.urlretrieve(url, tarball_path)
        freshly_downloaded = True

    # Extract after a fresh download (as before) and also when the archive is
    # already on disk but the CSV it should contain is missing.
    if freshly_downloaded or not csv_path.is_file():
        with tarfile.open(tarball_path) as housing_tarball:
            # NOTE(review): the archive comes from the network; on Python
            # versions that support it, consider extractall(..., filter='data').
            housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

# Load the housing dataset into a DataFrame (downloads the archive on first run).
housing = load_housing_data()

# Preview the first rows.
housing.head()

# Show column dtypes and non-null counts.
housing.info()

# Count how many rows carry each value of 'ocean_proximity'.
housing['ocean_proximity'].value_counts()

# Summary statistics of the numeric columns.
housing.describe()

# Draw one 50-bin histogram per numeric column.
housing.hist(bins=50, figsize=(12, 8))

# 테스트 세트 만들기

import numpy as np

def shuffle_and_split_data(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    Row positions are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the rest form the training set. The split is only reproducible if the
    caller seeds NumPy's global random state beforehand.

    Args:
        data: object supporting ``len()`` and positional ``.iloc`` indexing
            (a DataFrame in this file).
        test_ratio: fraction of the rows to place in the test set.

    Returns:
        tuple: ``(train_rows, test_rows)``.
    """
    row_count = len(data)
    order = np.random.permutation(row_count)
    cutoff = int(row_count * test_ratio)
    held_out = data.iloc[order[:cutoff]]
    remaining = data.iloc[order[cutoff:]]
    return remaining, held_out

# Hold out 20% of the rows as a randomly chosen test set.
train_set, test_set = shuffle_and_split_data(housing, 0.2)

# Print the sizes of the resulting training and test sets.
print(len(train_set))
print(len(test_set))

from zlib import crc32

def is_id_in_test_set(identifier, test_ratio):
    """Decide from an identifier's checksum whether it belongs to the test set.

    The identifier is converted to ``np.int64`` and hashed with CRC32; it is
    assigned to the test set when that checksum is below ``test_ratio`` times
    2**32, so the same identifier always gets the same answer.

    Args:
        identifier: value convertible to ``np.int64``.
        test_ratio: fraction of the 32-bit checksum range mapped to the
            test set.

    Returns:
        bool: True when the identifier falls in the test set.
    """
    threshold = test_ratio * 2**32
    checksum = crc32(np.int64(identifier))
    return checksum < threshold

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into train/test rows based on a hash of an ID column.

    Each value in ``data[id_column]`` is passed to ``is_id_in_test_set``;
    rows for which it returns True go to the test set, all others to the
    training set.

    Args:
        data: DataFrame to split.
        test_ratio: ratio forwarded to ``is_id_in_test_set``.
        id_column: name of the column holding the row identifiers.

    Returns:
        tuple: ``(train_rows, test_rows)``.
    """
    def belongs_to_test(row_id):
        return is_id_in_test_set(row_id, test_ratio)

    test_mask = data[id_column].apply(belongs_to_test)
    train_mask = ~test_mask
    return data.loc[train_mask], data.loc[test_mask]

housing_with_id = housing.reset_index()    # returns a DataFrame with an added `index` column

# Split using the row index as the identifier.
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, 'index')

# Alternative identifier built from each row's longitude and latitude.
housing_with_id['id'] = housing['longitude'] * 1000 + housing['latitude']
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, 'id')



from sklearn.model_selection import train_test_split

# Random 80/20 split; the fixed random_state makes the result repeatable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Preview the first rows of the training set.
train_set.head()

import matplotlib.pyplot as plt

# Bucket median_income into five categories labelled 1-5 using the bin edges
# below; the last bin is open-ended (up to infinity).
housing['income_cat'] = pd.cut(housing['median_income'],
                               bins=[0., 1.5, 3.0, 4.5, 6, np.inf],
                               labels=[1, 2, 3, 4, 5])

# Bar chart of the number of rows per income category, ordered by category.
housing['income_cat'].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel('Income Category')
plt.ylabel('Number of districts')

 C:\Users\\Desktop\project\jupyter_notebook\ml 디렉터리

2023-12-06  오후 02:27    <DIR>          .
2023-12-06  오후 02:24    <DIR>          ..
2023-12-06  오후 02:27    <DIR>          .ipynb_checkpoints
2023-12-06  오후 02:25    <DIR>          datasets
2023-12-06  오후 02:27           176,356 머신러닝 개념.ipynb
               1개 파일             176,356 바이트
               4개 디렉터리  159,471,996,928 바이트 남음

C:\Users\\Desktop\project\jupyter_notebook\ml>cd datasets

C:\Users\\Desktop\project\jupyter_notebook\ml\datasets>dir
 C 드라이브의 볼륨에는 이름이 없습니다.
 볼륨 일련 번호: 52A9-0E40

 C:\Users\\Desktop\project\jupyter_notebook\ml\datasets 디렉터리

2023-12-06  오후 02:25    <DIR>          .
2023-12-06  오후 02:27    <DIR>          ..
2022-02-21  오전 05:34    <DIR>          housing
2023-12-06  오후 02:25           449,115 housing.tgz
               1개 파일             449,115 바이트
               3개 디렉터리  159,464,611,840 바이트 남음

C:\Users\\Desktop\project\jupyter_notebook\ml\datasets>tar -zxvf housing.tgz
x housing/
x housing/housing.csv

C:\Users\\Desktop\project\jupyter_notebook\ml\datasets>tar
tar: Must specify one of -c, -r, -t, -u, -x

압축 풀림

저작자표시

'머신러닝' 카테고리의 다른 글

보스턴 주택 가격 회귀 (0)	2024.01.26
사용자 행동 인식 데이터 세트 (0)	2024.01.17
피마 인디언 당뇨병 예측 (0)	2024.01.16
타이타닉 생존자 예측 (0)	2024.01.15
머신러닝 개념 (0)	2023.12.04

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

인기포스트

ABOUT ME

개발자 어쩌구 개발자 어쩌구

'머신러닝' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

인기포스트

ABOUT ME

'머신러닝' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역