
Commit d84bcba ("update")

1 parent ec4adc0

668 files changed (+52968, -1 lines)


README.md (+1, -1)

```diff
@@ -99,4 +99,4 @@ If you find our paper and/or code helpful, please consider citing:
   year={2024},
   organization={IEEE}
 }
-```
+```
```

app/__init__.py (+30)

```python
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests

import streamlit as st
import torch


@st.cache()
def load_demo_image():
    img_url = (
        "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
    )
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    return raw_image


# lyz modifies cuda

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# cache_root = "/export/home/.cache/lavis/"

cache_root = "/data/xcg/lavis_data/.cache/"
```

app/calculate_coco_features.py (+90)

```python
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests
import torch

import os

from lavis.common.registry import registry
from lavis.processors import *
from lavis.models import *
from lavis.common.utils import build_default_model

# lyz modifies cuda

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_demo_image():
    img_url = (
        "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
    )
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

    return raw_image


def read_img(filepath):
    raw_image = Image.open(filepath).convert("RGB")

    return raw_image


# model
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth"
feature_extractor = BlipFeatureExtractor(pretrained=model_url)

feature_extractor.eval()
feature_extractor = feature_extractor.to(device)

# preprocessors
vis_processor = BlipImageEvalProcessor(image_size=224)
text_processor = BlipCaptionProcessor()

# files to process
# file_root = "/export/home/.cache/lavis/coco/images/val2014"
# file_root = "/export/home/.cache/lavis/coco/images/train2014"
file_root = "/data/xcg/lavis_data/coco/images/train2014"
filepaths = os.listdir(file_root)

print(len(filepaths))

caption = "dummy"

path2feat = dict()
bsz = 256

images_in_batch = []
filepaths_in_batch = []

for i, filename in enumerate(filepaths):
    if i % bsz == 0 and i > 0:
        images_in_batch = torch.cat(images_in_batch, dim=0).to(device)
        with torch.no_grad():
            image_features = feature_extractor(
                images_in_batch, caption, mode="image", normalized=True
            )[:, 0]

        for filepath, image_feat in zip(filepaths_in_batch, image_features):
            path2feat[os.path.basename(filepath)] = image_feat.detach().cpu()

        images_in_batch = []
        filepaths_in_batch = []

        print(len(path2feat), image_features.shape)
    else:
        filepath = os.path.join(file_root, filename)

        image = read_img(filepath)
        image = vis_processor(image).unsqueeze(0)

        images_in_batch.append(image)
        filepaths_in_batch.append(filepath)

torch.save(path2feat, "path2feat_coco_train2014.pth")
```
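One detail worth flagging in the loop above: the accumulated batch is flushed only when `i % bsz == 0 and i > 0`, and the current file is appended only in the `else` branch, so every 256th image (indices 256, 512, ...) is never encoded, and the final partial batch is silently dropped. A sketch of an equivalent loop without those gaps, reusing the `read_img`, `vis_processor`, `feature_extractor`, and `caption` defined above:

```python
def flush(images, paths, path2feat):
    """Encode one batch and store normalized CLS features keyed by file name."""
    if not images:
        return
    batch = torch.cat(images, dim=0).to(device)
    with torch.no_grad():
        feats = feature_extractor(batch, caption, mode="image", normalized=True)[:, 0]
    for path, feat in zip(paths, feats):
        path2feat[os.path.basename(path)] = feat.detach().cpu()


images_in_batch, filepaths_in_batch = [], []
for filename in filepaths:
    filepath = os.path.join(file_root, filename)
    images_in_batch.append(vis_processor(read_img(filepath)).unsqueeze(0))
    filepaths_in_batch.append(filepath)
    if len(images_in_batch) == bsz:
        flush(images_in_batch, filepaths_in_batch, path2feat)
        images_in_batch, filepaths_in_batch = [], []

flush(images_in_batch, filepaths_in_batch, path2feat)  # encode the remainder
```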

app/caption.py (+98)

```python
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import streamlit as st
from app import device, load_demo_image
from app.utils import load_model_cache
from lavis.processors import load_processor
from PIL import Image


def app():
    # ===== layout =====
    model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])

    sampling_method = st.sidebar.selectbox(
        "Sampling method:", ["Beam search", "Nucleus sampling"]
    )

    st.markdown(
        "<h1 style='text-align: center;'>Image Description Generation</h1>",
        unsafe_allow_html=True,
    )

    instructions = """Try the provided image or upload your own:"""
    file = st.file_uploader(instructions)

    use_beam = sampling_method == "Beam search"

    col1, col2 = st.columns(2)

    if file:
        raw_img = Image.open(file).convert("RGB")
    else:
        raw_img = load_demo_image()

    col1.header("Image")

    w, h = raw_img.size
    scaling_factor = 720 / w
    resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))

    col1.image(resized_image, use_column_width=True)
    col2.header("Description")

    cap_button = st.button("Generate")

    # ==== event ====
    vis_processor = load_processor("blip_image_eval").build(image_size=384)

    if cap_button:
        if model_type.startswith("BLIP"):
            blip_type = model_type.split("_")[1].lower()
            model = load_model_cache(
                "blip_caption",
                model_type=f"{blip_type}_coco",
                is_eval=True,
                device=device,
            )

            img = vis_processor(raw_img).unsqueeze(0).to(device)
            captions = generate_caption(
                model=model, image=img, use_nucleus_sampling=not use_beam
            )

            col2.write("\n\n".join(captions), use_column_width=True)


def generate_caption(
    model, image, use_nucleus_sampling=False, num_beams=3, max_length=40, min_length=5
):
    samples = {"image": image}

    captions = []
    if use_nucleus_sampling:
        for _ in range(5):
            caption = model.generate(
                samples,
                use_nucleus_sampling=True,
                max_length=max_length,
                min_length=min_length,
                top_p=0.9,
            )
            captions.append(caption[0])
    else:
        caption = model.generate(
            samples,
            use_nucleus_sampling=False,
            num_beams=num_beams,
            max_length=max_length,
            min_length=min_length,
        )
        captions.append(caption[0])

    return captions
```
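The two branches of `generate_caption` map directly onto the flags of LAVIS's `model.generate`: beam search produces a single deterministic caption, while nucleus sampling (`top_p=0.9`) is invoked five times to collect diverse candidates. A minimal headless sketch of the same call path, assuming LAVIS's standard `load_model_and_preprocess` loader and the demo image from `app/__init__.py`:

```python
# Illustrative usage outside Streamlit; assumes lavis is installed.
from lavis.models import load_model_and_preprocess

from app import device, load_demo_image
from app.caption import generate_caption

model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="base_coco", is_eval=True, device=device
)

img = vis_processors["eval"](load_demo_image()).unsqueeze(0).to(device)

print(generate_caption(model, img))                             # one beam-search caption
print(generate_caption(model, img, use_nucleus_sampling=True))  # five sampled captions
```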
